Apply function with different sources in R - r

I'm having trouble using the apply function and repeatedly get an error about different sources.
I believe that both sources are data.frames, so I can't work out why it doesn't like the apply inputs.
I would like to get to the output_desired without the need to run them individually. I appreciate I could likely do this with a loop, but am trying to learn some need tricks.
TLDR; I am trying to group the input_s into an object, and then run that dataframe through the function(result) and rbind the outputs.
library(dplyr)
library(tidyr)
## Inputs ##
input_1 = structure(list(V1 = c("Team_2022", "Team_2022", "Team_2022"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(55, 76, 14)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_2 = structure(list(V1 = c("Team_2023", "Team_2023", "Team_2023"), V2 = c("Bill", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(113, 23, 10)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_3 = structure(list(V1 = c("Team_2024", "Team_2024", "Team_2024"), V2 = c("Frank", "Mary", "Bill"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(7, 19, 52)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_4 = structure(list(V1 = c("Team_2025", "Team_2025", "Team_2025"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(46, 44, 88)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
## Teams ##
teams = structure(list(V1 = c("team1", "team2", "team3"), V2 = c("Mary + Frank","Mary + John", "Mary + Bill")), class = "data.frame", row.names = c(NA, -3L))
## Group the inputs into one ##
all_objects = ls()
input_objects = grep("^input", all_objects, value = T)
input_test = as.data.frame(input_obj)
## Function ##
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4))
return(data)
}
## Outputs individually ##
output_1 = result(input_1, teams)
output_2 = result(input_2, teams)
output_3 = result(input_3, teams)
output_4 = result(input_4, teams)
## Join outputs ##
output_desired = rbind(output_1, output_2, output_3, output_4)
## Failed apply ##
output_apply = apply(input_test, 1, function(x) {
result(x, teams)
}) %>% do.call("rbind", .)

Using lapply and dplyr::bind_rows you could do:
Note: I also fixed the creation of your input_test list for which I use lapply and get.
library(dplyr)
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
input_test %>%
lapply(result, teams) %>%
bind_rows()
#> # A tibble: 12 × 5
#> V1.x V1.y V2 V3 V4
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 team1 Team_2022 Mary + Frank Sydney 131
#> 2 team2 Team_2022 Mary + John Sydney 90
#> 3 team3 Team_2022 Mary + Bill Sydney 76
#> 4 team1 Team_2023 Mary + Frank Sydney 23
#> 5 team2 Team_2023 Mary + John Sydney 33
#> 6 team3 Team_2023 Mary + Bill Sydney 136
#> 7 team1 Team_2024 Mary + Frank Sydney 26
#> 8 team2 Team_2024 Mary + John Sydney 19
#> 9 team3 Team_2024 Mary + Bill Sydney 71
#> 10 team1 Team_2025 Mary + Frank Sydney 90
#> 11 team2 Team_2025 Mary + John Sydney 132
#> 12 team3 Team_2025 Mary + Bill Sydney 44
Or using purrr::map_df:
purrr::map_df(input_test, result, teams)

Related

Parse a function input name as an output name

I am trying to add the data input variable name as an output value in a separate column ($V5 within my function "result).
The inputs are different data values, and would like to save these names in the outputs so I can track where the data is coming from.
I thought this would be fairly straightforward, and that I could use print(deparse(substitute(input))) but this doesn't work.
Can anyone recommend a solution?
library(dplyr)
library(tidyr)
## Inputs ##
input_1 = structure(list(V1 = c("Team_2022", "Team_2022", "Team_2022"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(55, 76, 14)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_2 = structure(list(V1 = c("Team_2023", "Team_2023", "Team_2023"), V2 = c("Bill", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(113, 23, 10)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_3 = structure(list(V1 = c("Team_2024", "Team_2024", "Team_2024"), V2 = c("Frank", "Mary", "Bill"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(7, 19, 52)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_4 = structure(list(V1 = c("Team_2025", "Team_2025", "Team_2025"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(46, 44, 88)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
## Teams ##
teams = structure(list(V1 = c("team1", "team2", "team3"), V2 = c("Mary + Frank","Mary + John", "Mary + Bill")), class = "data.frame", row.names = c(NA, -3L))
## Group the inputs into one ##
all_objects = ls()
input_objects = grep("^input", all_objects, value = T)
input_test = as.data.frame(input_obj)
## Function ##
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
V5 = print(deparse(substitute(input))))
return(data)
}
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
output = input_test %>%
lapply(result, teams) %>%
bind_rows()
### Current output ###
structure(list(V1.x = c("team1", "team2", "team3", "team1", "team2",
"team3", "team1", "team2", "team3", "team1", "team2", "team3"
), V1.y = c("Team_2022", "Team_2022", "Team_2022", "Team_2023",
"Team_2023", "Team_2023", "Team_2024", "Team_2024", "Team_2024",
"Team_2025", "Team_2025", "Team_2025"), V2 = c("Mary + Frank",
"Mary + John", "Mary + Bill", "Mary + Frank", "Mary + John",
"Mary + Bill", "Mary + Frank", "Mary + John", "Mary + Bill",
"Mary + Frank", "Mary + John", "Mary + Bill"), V3 = c("Sydney",
"Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney",
"Sydney", "Sydney", "Sydney", "Sydney"), V4 = c(131, 90, 76,
23, 33, 136, 26, 19, 71, 90, 132, 44), V5 = c("input", "input",
"input", "input", "input", "input", "input", "input", "input",
"input", "input", "input")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -12L))
### Desired Output ###
structure(list(V1.x = c("team1", "team2", "team3", "team1", "team2",
"team3", "team1", "team2", "team3", "team1", "team2", "team3"
), V1.y = c("Team_2022", "Team_2022", "Team_2022", "Team_2023",
"Team_2023", "Team_2023", "Team_2024", "Team_2024", "Team_2024",
"Team_2025", "Team_2025", "Team_2025"), V2 = c("Mary + Frank",
"Mary + John", "Mary + Bill", "Mary + Frank", "Mary + John",
"Mary + Bill", "Mary + Frank", "Mary + John", "Mary + Bill",
"Mary + Frank", "Mary + John", "Mary + Bill"), V3 = c("Sydney",
"Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney",
"Sydney", "Sydney", "Sydney", "Sydney"), V4 = c(131, 90, 76,
23, 33, 136, 26, 19, 71, 90, 132, 44), V5 = c("input_1", "input_1",
"input_1", "input_2", "input_2", "input_2", "input_3", "input_3", "input_3",
"input_4", "input_4", "input_4")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -12L))
If I understand you correctly: the purrr::map_df() function has a nice feature to identify inputs lists (ideally named) in final data.frames:
library(dplyr)
library(tidyr)
# dropping V5 as it will be "automatically" computed
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>%
fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4)
# we do not need V5 here anymore
)
return(data)
}
l_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
# name the object list to use the name for identification
names(input_test) <- input_objects
# use purrr map to data.frame with the .id feature
purrr::map_df(input_test, ~result(.x, teams), .id = "V5")
V5 V1.x V1.y V2 V3 V4
<chr> <chr> <chr> <chr> <chr> <dbl>
1 input_1 team1 Team_2022 Mary + Frank Sydney 131
2 input_1 team2 Team_2022 Mary + John Sydney 90
3 input_1 team3 Team_2022 Mary + Bill Sydney 76
4 input_2 team1 Team_2023 Mary + Frank Sydney 23
5 input_2 team2 Team_2023 Mary + John Sydney 33
6 input_2 team3 Team_2023 Mary + Bill Sydney 136
7 input_3 team1 Team_2024 Mary + Frank Sydney 26
8 input_3 team2 Team_2024 Mary + John Sydney 19
9 input_3 team3 Team_2024 Mary + Bill Sydney 71
10 input_4 team1 Team_2025 Mary + Frank Sydney 90
11 input_4 team2 Team_2025 Mary + John Sydney 132
12 input_4 team3 Team_2025 Mary + Bill Sydney 44
Note that this works without naming the list aswell, though you will only get the list item number, which might be insufficient.
Also the plyr::ldply function can be used to bind named lists, generating a new column with the list names in the result data.frame.
Instead of doing the deparse/substitute, create an argument in result for the names as well, and then use that
library(dplyr)
library(purrr)
library(tidyr)
result <- function(input, teams, inputnm) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
V5 = inputnm)
return(data)
}
-testing
input_test %>%
pull(input_objects) %>%
mget(inherits = TRUE) %>%
imap_dfr(~ result(.x, teams, .y))
-output
# A tibble: 12 × 6
V1.x V1.y V2 V3 V4 V5
<chr> <chr> <chr> <chr> <dbl> <chr>
1 team1 Team_2022 Mary + Frank Sydney 131 input_1
2 team2 Team_2022 Mary + John Sydney 90 input_1
3 team3 Team_2022 Mary + Bill Sydney 76 input_1
4 team1 Team_2023 Mary + Frank Sydney 23 input_2
5 team2 Team_2023 Mary + John Sydney 33 input_2
6 team3 Team_2023 Mary + Bill Sydney 136 input_2
7 team1 Team_2024 Mary + Frank Sydney 26 input_3
8 team2 Team_2024 Mary + John Sydney 19 input_3
9 team3 Team_2024 Mary + Bill Sydney 71 input_3
10 team1 Team_2025 Mary + Frank Sydney 90 input_4
11 team2 Team_2025 Mary + John Sydney 132 input_4
12 team3 Team_2025 Mary + Bill Sydney 44 input_4
If you just had one input at a time, simply moving the line for assigning V5 outside of your summarize and dplyr pipes would do it:
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
#V5 = print(deparse(substitute(input)))
)
data$V5 <- deparse(substitute(input))
return(data)
}
result(input_1, teams)
# V1.x V1.y V2 V3 V4 V5
# <chr> <chr> <chr> <chr> <dbl> <chr>
# 1 team1 Team_2022 Mary + Frank Sydney 131 input_1
# 2 team2 Team_2022 Mary + John Sydney 90 input_1
# 3 team3 Team_2022 Mary + Bill Sydney 76 input_1
But your use of lists makes that a bit challenging (it will output X[[i]] for that code).
To address this for using lapply, I would suggest simply adding an additional input that takes the name and simply assigning it that name, with additional tweaks to the lapply function to accommodate it:
result = function(input, teams, nme) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
#V5 = print(deparse(substitute(input)))
)
data$V5 <- nme
return(data)
}
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
# add in assigning names to the list
names(input_test) <- input_objects
output = lapply(input_objects, function(x) result(input_test[[x]], teams, nme = x)) %>%
bind_rows()
# V1.x V1.y V2 V3 V4 V5
# <chr> <chr> <chr> <chr> <dbl> <chr>
# 1 team1 Team_2022 Mary + Frank Sydney 131 input_1
# 2 team2 Team_2022 Mary + John Sydney 90 input_1
# 3 team3 Team_2022 Mary + Bill Sydney 76 input_1
# 4 team1 Team_2023 Mary + Frank Sydney 23 input_2
# 5 team2 Team_2023 Mary + John Sydney 33 input_2
# 6 team3 Team_2023 Mary + Bill Sydney 136 input_2
# 7 team1 Team_2024 Mary + Frank Sydney 26 input_3
# 8 team2 Team_2024 Mary + John Sydney 19 input_3
# 9 team3 Team_2024 Mary + Bill Sydney 71 input_3
# 10 team1 Team_2025 Mary + Frank Sydney 90 input_4
# 11 team2 Team_2025 Mary + John Sydney 132 input_4
# 12 team3 Team_2025 Mary + Bill Sydney 44 input_4

R: How to insert and repeat columns in between existing columns and name them accordingly

I have a dataframe that looks like this:
Col_1 Col_X_1 Col_2 Col_X_2 ...
ABC 890 AJF 341
JFH 183 DFJ 132
...
After each block of columns (e.g. Col_1 & Col_X_1) that belong together according to the number at the end, I want to insert two more (empty) columns with the names Col_Y_n and Col_Z_n, with n being the same number as the block of columns before.
The final dataframe should look like this:
Col_1 Col_X_1 Col_Y_1 Col_Z_1 Col_2 Col_X_2 Col_Y_2 Col_Z_2 ...
ABC 890 AJF 341
JFH 183 DFJ 132
...
How can I accomplish this?
Here my dput output of my real data:
structure(list(Company = c("CompanyA", "CompanyB"),
Team_1 = c("NameA", "NameB"), Team_Desc_1 = c("Founder & Co-CEO",
"Senior Blockchain Engineer"), Team_URL_1 = c("https://www.linkedin.com/in/NameA/",
NA), Team_Ver_1 = c("unverified", NA), Team_2 = c("NameC",
"NameD"), Team_Desc_2 = c("Chairman", "Senior Software Engineer"
), Team_URL_2 = c("https://www.linkedin.com/in/NameC/",
NA), Team_Ver_2 = c("unverified", NA), Team_3 = c("NameE",
"NameF")), class = c("grouped_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -2L), groups = structure(list(
Company = c("CompanyB", "CompanyA"), .rows = structure(list(
2L, 1L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
In base R you can try something like this; the idea is to split() the data.frame in n data.frames in one list, one for each number in colnames, and then apply all the wrangling you need to each part of the list.
# transpose your data
df_t <- data.frame(t(df[,-1]))
# add a variabile made of the number in colnames
df_t$var <- gsub("[^0-9]", "", rownames(df_t))
# split into a list
df_t_list <- split(df_t, df_t$var)
# EDIT: if you have different names, like 1, 12, 3, you can order the
# list then continue the work
sorting <- sort(as.numeric(names(df_t_list)))
df_t_list <- df_t_list[as.character(sorting)]
Now for each element of the list, we do some operations: we use lapply() function and a last for loop:
# remove the useless column used to split
df_t_list <- lapply(df_t_list, function(x) { x["var"] <- NULL; data.frame(t(x)) })
# add the columns you need
df_t_list <- lapply(df_t_list, function(x) { x$col_Y_ <- NA; x$col_Z_ <- NA;x })
We have columns with the last number on (old columns) and the new columns without it. Let's remove it from all the columns then add it to all.
# remove
df_t_list <- lapply(df_t_list, function(x) {colnames(x) <- gsub("[0-9]", "", colnames(x));x})
# add
for(i in seq_along(df_t_list)) {colnames(df_t_list[[i]]) <- paste0(colnames(df_t_list[[i]]),names(df_t_list)[i])}
Let's get everything together:
do.call(cbind, unname(df_t_list))
Team_1 Team_Desc_1 Team_URL_1 Team_Ver_1 col_Y_1 col_Z_1 Team_3 col_Y_3 col_Z_3 Team_12 Team_Desc_12
X1 NameA Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified NA NA NameE NA NA NameC Chairman
X2 NameB Senior Blockchain Engineer <NA> <NA> NA NA NameF NA NA NameD Senior Software Engineer
Team_URL_12 Team_Ver_12 col_Y_12 col_Z_12
X1 https://www.linkedin.com/in/NameC/ unverified NA NA
X2 <NA> <NA> NA NA
With new data:
structure(list(Company = c("CompanyA", "CompanyB"), Team_1 = c("NameA",
"NameB"), Team_Desc_1 = c("Founder & Co-CEO", "Senior Blockchain Engineer"
), Team_URL_1 = c("https://www.linkedin.com/in/NameA/", NA),
Team_Ver_1 = c("unverified", NA), Team_12 = c("NameC", "NameD"
), Team_Desc_12 = c("Chairman", "Senior Software Engineer"
), Team_URL_12 = c("https://www.linkedin.com/in/NameC/",
NA), Team_Ver_12 = c("unverified", NA), Team_3 = c("NameE",
"NameF")), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), groups = structure(list(Company = c("CompanyB",
"CompanyA"), .rows = structure(list(2L, 1L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE))
Here's a way to do what you want using data.table and stringr.
library(data.table)
library(stringr)
group_ids <- str_extract(names(data), "[0-9]*$")
group_lvls <- factor(group_ids, levels=unique(group_ids))
groups <- split(colnames(data), group_lvls)
add_empty <- function(x){
fun_id <- stringr::str_extract(x[1], "[0-9]*$")
x <- c(x, paste0("Col_Y_", fun_id), paste0("Col_Z_", fun_id) )
return(x)
}
fnl_groups <- lapply(groups, add_empty)
struckt1 <- lapply(unlist(fnl_groups, use.names = FALSE), function(x) {df <- data.frame(col = NA); names(df) <- x; return(df)})
struckt2 <- do.call(cbind, struckt1)
res <- rbindlist(list(struckt2, data), use.names = TRUE, fill = TRUE)[-1,]
setcolorder(res, neworder = "Company")
Output:
Company Team_1 Team_Desc_1 Team_URL_1 Team_Ver_1 Col_Y_1 Col_Z_1 Team_2 Team_Desc_2
1: CompanyA NameA Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified NA NA NameC Chairman
2: CompanyB NameB Senior Blockchain Engineer <NA> <NA> NA NA NameD Senior Software Engineer
Team_URL_2 Team_Ver_2 Col_Y_2 Col_Z_2 Team_3 Col_Y_3 Col_Z_3
1: https://www.linkedin.com/in/NameC/ unverified NA NA NameE NA NA
2: <NA> <NA> NA NA NameF NA NA
Edit: Addressing the comment below, the script doesn't seem to affect the ordering of other columns.
Altered data (replaced Team 3 with Team 13)
structure(
list(
Company = c("CompanyA", "CompanyB"),
Team_1 = c("NameA", "NameB"),
Team_Desc_1 = c("Founder & Co-CEO",
"Senior Blockchain Engineer"),
Team_URL_1 = c("https://www.linkedin.com/in/NameA/",
NA),
Team_Ver_1 = c("unverified", NA),
Team_2 = c("NameC",
"NameD"),
Team_Desc_2 = c("Chairman", "Senior Software Engineer"),
Team_URL_2 = c("https://www.linkedin.com/in/NameC/",
NA),
Team_Ver_2 = c("unverified", NA),
Team_13 = c("NameE",
"NameF")
),
class = c("grouped_df", "tbl_df", "tbl",
"data.frame"),
row.names = c(NA,-2L),
groups = structure(
list(
Company = c("CompanyB", "CompanyA"),
.rows = structure(
list(2L, 1L),
ptype = integer(0),
class = c("vctrs_list_of",
"vctrs_vctr", "list")
)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA,-2L),
.drop = TRUE
)
) -> data
Output:
> res
Company Col_Y_ Col_Z_ Team_1 Team_Desc_1 Team_URL_1 Team_Ver_1 Col_Y_1 Col_Z_1 Team_2
1: CompanyA NA NA NameA Founder & Co-CEO https://www.linkedin.com/in/NameA/ unverified NA NA NameC
2: CompanyB NA NA NameB Senior Blockchain Engineer <NA> <NA> NA NA NameD
Team_Desc_2 Team_URL_2 Team_Ver_2 Col_Y_2 Col_Z_2 Team_13 Col_Y_13 Col_Z_13
1: Chairman https://www.linkedin.com/in/NameC/ unverified NA NA NameE NA NA
2: Senior Software Engineer <NA> <NA> NA NA NameF NA NA

Quicker way? Remove rows in book1, take row 4 values as column name, set some column name same as book2

Below is the first dataframe where I want to remove the first 3 rows:
book1 <- structure(list(Instructions..xyz = c("Note: abc", "", "Set1",
"id", "632592651", "633322173", "634703802", "634927873", "635812953",
"636004739", "636101211", "636157799", "636263106", "636752420"
), X = c("", "", "", "title", "asdf", "cat", "dog", "mouse",
"elephant", "goose", "rat", "mice", "kitty", "kitten"), X.1 = c("",
"", "", "hazard", "y", "y", "y", "n", "n", "y", "y", "n", "n",
"y"), X.2 = c("", "", "Set2", "id", "632592651", "633322173",
"634703802", "634927873", "635812953", "636004739", "636101211",
"636157799", "636263106", "636752420"), X.3 = c("", "", "", "title",
"asdf2", "cat2", "dog2", "mouse2", "elephant2", "goose2", "rat2",
"mice2", "kitty2", "kitten2"), X.4 = c("", "", "", "index", "0.664883807",
"0.20089779", "0.752228086", "0.124729276", "0.626285086", "0.134537909",
"0.612526768", "0.769622463", "0.682532524", "0.819015658")), class = "data.frame", row.names = c(NA,
-14L))
I did book1 <- book1[-c(1:3),] but I'm not sure how to make id, title, hazard, id, title, index as the column name instead of Instructions..xyz, etc. See image below for desired output
Then for the second dataframe,
book2 <- structure(list(identity = c(632592651L, 633322173L, 634703802L,
634927873L, 635812953L, 636004739L, 636101211L, 636157799L, 636263106L,
636752420L, 636809222L, 2004722036L, 2004894388L, 2005045755L,
2005535472L, 2005630542L, 2005788781L, 2005809679L, 2005838317L,
2005866692L), text = c("asdf_xyz", "cat", "dog", "mouse", "elephant",
"goose", "rat", "mice", "kitty", "kitten", "tiger_xyz", "lion",
"leopard", "ostrich", "kangaroo", "platypus", "fish", "reptile",
"mammals", "amphibians_xyz"), volume = c(1234L, 432L, 324L, 333L,
2223L, 412346L, 7456L, 3456L, 2345L, 2345L, 6L, 345L, 23L, 2L,
4778L, 234L, 8675L, 3459L, 8L, 9L)), class = "data.frame", row.names = c(NA,
-20L))
I then rename column 1 and 2 in book2 so that it matches that of book1 by names(book2)[1:2] <- c('id','title') where I can later do inner_join. The desired output is shown in the image below by
library(dplyr)
book1 %>%
inner_join(book2, by = c("id", "title"))
This is taking quite a few steps and wondering if there's a simplified version to this?
Something like this?
# split the data by columns
book2a <- book1[-(1:4), 1:3]
book2b <- book1[-(1:4), 4:6]
# take care of names
names(book2a) <- book1[4, 1:3, drop = TRUE]
names(book2b) <- book1[4, 4:6, drop = TRUE]
# book2b needs processing
book2b$title <- sub("2", "", book2b$title)
book2b$index <- as.numeric(book2b$index)
# join both data sets and clean-up
book2 <- merge(book2a, book2b, all = TRUE)
rm(book2a, book2b)
book2
#> id title hazard index
#> 1 632592651 asdf y 0.6648838
#> 2 633322173 cat y 0.2008978
#> 3 634703802 dog y 0.7522281
#> 4 634927873 mouse n 0.1247293
#> 5 635812953 elephant n 0.6262851
#> 6 636004739 goose y 0.1345379
#> 7 636101211 rat y 0.6125268
#> 8 636157799 mice n 0.7696225
#> 9 636263106 kitty n 0.6825325
#> 10 636752420 kitten y 0.8190157
Created on 2022-06-25 by the reprex package (v2.0.1)
Found the solution to the first question
library(janitor)
book1 <- row_to_names(dat=book1, row_number=4, remove_row = TRUE, remove_rows_above = TRUE)
I applied
names(book1)[4:5] <- c('id1','title1')
to obtain unique column name, then tried inner_join as proposed earlier but with error and found that book1$id is character where book2$id is int and so I did
book1$id <- as.integer(book1$id)
and finally it works with
library(tidyverse)
Yeah <- book1 %>%
inner_join(book2, by = c("id", "title"))
Output below:
id title hazard id1 title1 index volume
1 633322173 cat y 633322173 cat2 0.20089779 432
2 634703802 dog y 634703802 dog2 0.752228086 324
3 634927873 mouse n 634927873 mouse2 0.124729276 333
4 635812953 elephant n 635812953 elephant2 0.626285086 2223
5 636004739 goose y 636004739 goose2 0.134537909 412346
6 636101211 rat y 636101211 rat2 0.612526768 7456
7 636157799 mice n 636157799 mice2 0.769622463 3456
8 636263106 kitty n 636263106 kitty2 0.682532524 2345
9 636752420 kitten y 636752420 kitten2 0.819015658 2345
Still wondering if there's a quicker way?

Perform a series of mutations to columns in dataframe

I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the results does not get applied to all the rows and only:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
I think we can use lookup vectors for this, if it's easy or safer to use static string lookups:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
mutate(
Loading = coalesce(tr_vec[Loading], Loading),
Girder = coalesce(tr_vec2[Girder], Girder)
)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may (unlikely in that answer, but) miss match.
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything until last dot
Girder - extracts a word between two dots.
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_extract(meow, "single|multi")) %>%
mutate(Girder = str_extract(meow, "Interior|Exterior"))
It's worth noting that the demo data has a weird entry for meow in one column, so it didn't run perfectly on my machine:

Addition of specific rows in a dataframe

I'm trying to add specific row of a data frame together.
And short of using grepl to find lines and then rbinding them to the bottom, I'm not sure if there's a better way to do this.
this is my input df:
input = structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales","Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ", "NZ","NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L)), row.names = c(NA,6L),
class = "data.frame")
and this is my expected output:
structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales",
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred", "Johnny + Fred",
"Meg + Fred", "Johnny + Meg + Fred", "Johnny + Fred", "Meg + Fred",
"Johnny + Meg + Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ",
"NZ", "NZ", "Australia", "Australia", "Australia", "NZ", "NZ", "NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L, 329L, 2073L, 2227L, 1490L, 278L, 1513L)),
class = "data.frame", row.names = c(NA, -12L)
)
I would've thought there's a better way to there's a better way of adding these rows that filtering and then adding, and then joining etc.
Can anyone point me in the right direction of what I should be looking for?
I solve the problem using combn
Data input part
input = structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales","Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ", "NZ","NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L)), row.names = c(NA,6L),
class = "data.frame")
structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales",
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred", "Johnny + Fred",
"Meg + Fred", "Johnny + Meg + Fred", "Johnny + Fred", "Meg + Fred",
"Johnny + Meg + Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ",
"NZ", "NZ", "Australia", "Australia", "Australia", "NZ", "NZ", "NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L, 329L, 2073L, 2227L, 1490L, 278L, 1513L)),
class = "data.frame", row.names = c(NA, -12L)
)
Solution
library(dplyr)
TT = unique(input$V2)
> TT
[1] "Johnny" "Meg" "Fred"
comb2 = combn(TT,2,simplify = FALSE)
> comb2
[[1]]
[1] "Johnny" "Meg"
[[2]]
[1] "Johnny" "Fred"
[[3]]
[1] "Meg" "Fred"
comb3 = combn(TT,3,simplify = FALSE)
> comb3
[[1]]
[1] "Johnny" "Meg" "Fred"
result = function(data){
purrr::map_df(lapply(data,function(x){paste(x,collapse = '|')}), function(x){
df = input[grepl(x,input$V2),] %>% group_by(V3)%>%summarize(V1= 'Sales',
V2= paste(V2,collapse = '+'),
V4= sum(V4))
return(df)
}
)
}
Result
result(comb2)
# A tibble: 6 x 4
V3 V1 V2 V4
<chr> <chr> <chr> <int>
1 Australia Sales Johnny+Meg 2052
2 NZ Sales Johnny+Meg 1258
3 Australia Sales Johnny+Fred 329
4 NZ Sales Johnny+Fred 1490
5 Australia Sales Meg+Fred 2073
6 NZ Sales Meg+Fred 278
result(comb3)
# A tibble: 2 x 4
V3 V1 V2 V4
<chr> <chr> <chr> <int>
1 Australia Sales Johnny+Meg+Fred 2227
2 NZ Sales Johnny+Meg+Fred 1513
finalResult = bind_rows(A,B,input) %>%
select(V1,V2,V3,V4) %>% filter(! V2 %in% c('Johnny+Meg'))
> finalResult
# A tibble: 12 x 4
V1 V2 V3 V4
<chr> <chr> <chr> <int>
1 Sales Johnny+Fred Australia 329
2 Sales Johnny+Fred NZ 1490
3 Sales Meg+Fred Australia 2073
4 Sales Meg+Fred NZ 278
5 Sales Johnny+Meg+Fred Australia 2227
6 Sales Johnny+Meg+Fred NZ 1513
7 Sales Johnny Australia 154
8 Sales Meg Australia 1898
9 Sales Fred Australia 175
10 Sales Johnny NZ 1235
11 Sales Meg NZ 23
12 Sales Fred NZ 255
Using tidyverse we can first split the dataframe based on V3 then create combination of names and add sum to create a new tibble and bind it to the original dataframe.
library(tidyverse)
input %>%
bind_rows(input %>%
group_split(V3) %>%
map_dfr(function(x) map_dfr(2:nrow(x), ~tibble(
V1 = first(x$V1),
V2 = combn(x$V2, ., paste, collapse = " + "),
V3 = first(x$V3),
V4 = combn(x$V4, .,sum)) %>%
filter(grepl("\\bFred\\b", V2)))))
# V1 V2 V3 V4
#1 Sales Johnny Australia 154
#2 Sales Meg Australia 1898
#3 Sales Fred Australia 175
#4 Sales Johnny NZ 1235
#5 Sales Meg NZ 23
#6 Sales Fred NZ 255
#7 Sales Johnny + Fred Australia 329
#8 Sales Meg + Fred Australia 2073
#9 Sales Johnny + Meg + Fred Australia 2227
#10 Sales Johnny + Fred NZ 1490
#11 Sales Meg + Fred NZ 278
#12 Sales Johnny + Meg + Fred NZ 1513
Using the same logic but in base R, we can do
rbind(input, do.call(rbind, lapply(split(input, input$V3), function(x)
do.call(rbind, lapply(2:nrow(x), function(y)
subset(data.frame(V1 = x$V1[1],
V2 = combn(x$V2, y, paste, collapse = " + "),
V3 = x$V3[1],
V4 = combn(x$V4, y, sum)),
grepl("\\bFred\\b", V2)))))))

Resources