I am trying to add the data input variable name as an output value in a separate column ($V5 within my function "result).
The inputs are different data values, and would like to save these names in the outputs so I can track where the data is coming from.
I thought this would be fairly straightforward, and that I could use print(deparse(substitute(input))) but this doesn't work.
Can anyone recommend a solution?
library(dplyr)
library(tidyr)
## Inputs ##
input_1 = structure(list(V1 = c("Team_2022", "Team_2022", "Team_2022"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(55, 76, 14)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_2 = structure(list(V1 = c("Team_2023", "Team_2023", "Team_2023"), V2 = c("Bill", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(113, 23, 10)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_3 = structure(list(V1 = c("Team_2024", "Team_2024", "Team_2024"), V2 = c("Frank", "Mary", "Bill"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(7, 19, 52)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_4 = structure(list(V1 = c("Team_2025", "Team_2025", "Team_2025"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(46, 44, 88)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
## Teams ##
teams = structure(list(V1 = c("team1", "team2", "team3"), V2 = c("Mary + Frank","Mary + John", "Mary + Bill")), class = "data.frame", row.names = c(NA, -3L))
## Group the inputs into one ##
all_objects = ls()
input_objects = grep("^input", all_objects, value = T)
input_test = as.data.frame(input_obj)
## Function ##
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
V5 = print(deparse(substitute(input))))
return(data)
}
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
output = input_test %>%
lapply(result, teams) %>%
bind_rows()
### Current output ###
structure(list(V1.x = c("team1", "team2", "team3", "team1", "team2",
"team3", "team1", "team2", "team3", "team1", "team2", "team3"
), V1.y = c("Team_2022", "Team_2022", "Team_2022", "Team_2023",
"Team_2023", "Team_2023", "Team_2024", "Team_2024", "Team_2024",
"Team_2025", "Team_2025", "Team_2025"), V2 = c("Mary + Frank",
"Mary + John", "Mary + Bill", "Mary + Frank", "Mary + John",
"Mary + Bill", "Mary + Frank", "Mary + John", "Mary + Bill",
"Mary + Frank", "Mary + John", "Mary + Bill"), V3 = c("Sydney",
"Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney",
"Sydney", "Sydney", "Sydney", "Sydney"), V4 = c(131, 90, 76,
23, 33, 136, 26, 19, 71, 90, 132, 44), V5 = c("input", "input",
"input", "input", "input", "input", "input", "input", "input",
"input", "input", "input")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -12L))
### Desired Output ###
structure(list(V1.x = c("team1", "team2", "team3", "team1", "team2",
"team3", "team1", "team2", "team3", "team1", "team2", "team3"
), V1.y = c("Team_2022", "Team_2022", "Team_2022", "Team_2023",
"Team_2023", "Team_2023", "Team_2024", "Team_2024", "Team_2024",
"Team_2025", "Team_2025", "Team_2025"), V2 = c("Mary + Frank",
"Mary + John", "Mary + Bill", "Mary + Frank", "Mary + John",
"Mary + Bill", "Mary + Frank", "Mary + John", "Mary + Bill",
"Mary + Frank", "Mary + John", "Mary + Bill"), V3 = c("Sydney",
"Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney",
"Sydney", "Sydney", "Sydney", "Sydney"), V4 = c(131, 90, 76,
23, 33, 136, 26, 19, 71, 90, 132, 44), V5 = c("input_1", "input_1",
"input_1", "input_2", "input_2", "input_2", "input_3", "input_3", "input_3",
"input_4", "input_4", "input_4")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -12L))
If I understand you correctly: the purrr::map_df() function has a nice feature to identify inputs lists (ideally named) in final data.frames:
library(dplyr)
library(tidyr)
# dropping V5 as it will be "automatically" computed
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>%
fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4)
# we do not need V5 here anymore
)
return(data)
}
l_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
# name the object list to use the name for identification
names(input_test) <- input_objects
# use purrr map to data.frame with the .id feature
purrr::map_df(input_test, ~result(.x, teams), .id = "V5")
V5 V1.x V1.y V2 V3 V4
<chr> <chr> <chr> <chr> <chr> <dbl>
1 input_1 team1 Team_2022 Mary + Frank Sydney 131
2 input_1 team2 Team_2022 Mary + John Sydney 90
3 input_1 team3 Team_2022 Mary + Bill Sydney 76
4 input_2 team1 Team_2023 Mary + Frank Sydney 23
5 input_2 team2 Team_2023 Mary + John Sydney 33
6 input_2 team3 Team_2023 Mary + Bill Sydney 136
7 input_3 team1 Team_2024 Mary + Frank Sydney 26
8 input_3 team2 Team_2024 Mary + John Sydney 19
9 input_3 team3 Team_2024 Mary + Bill Sydney 71
10 input_4 team1 Team_2025 Mary + Frank Sydney 90
11 input_4 team2 Team_2025 Mary + John Sydney 132
12 input_4 team3 Team_2025 Mary + Bill Sydney 44
Note that this works without naming the list aswell, though you will only get the list item number, which might be insufficient.
Also the plyr::ldply function can be used to bind named lists, generating a new column with the list names in the result data.frame.
Instead of doing the deparse/substitute, create an argument in result for the names as well, and then use that
library(dplyr)
library(purrr)
library(tidyr)
result <- function(input, teams, inputnm) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
V5 = inputnm)
return(data)
}
-testing
input_test %>%
pull(input_objects) %>%
mget(inherits = TRUE) %>%
imap_dfr(~ result(.x, teams, .y))
-output
# A tibble: 12 × 6
V1.x V1.y V2 V3 V4 V5
<chr> <chr> <chr> <chr> <dbl> <chr>
1 team1 Team_2022 Mary + Frank Sydney 131 input_1
2 team2 Team_2022 Mary + John Sydney 90 input_1
3 team3 Team_2022 Mary + Bill Sydney 76 input_1
4 team1 Team_2023 Mary + Frank Sydney 23 input_2
5 team2 Team_2023 Mary + John Sydney 33 input_2
6 team3 Team_2023 Mary + Bill Sydney 136 input_2
7 team1 Team_2024 Mary + Frank Sydney 26 input_3
8 team2 Team_2024 Mary + John Sydney 19 input_3
9 team3 Team_2024 Mary + Bill Sydney 71 input_3
10 team1 Team_2025 Mary + Frank Sydney 90 input_4
11 team2 Team_2025 Mary + John Sydney 132 input_4
12 team3 Team_2025 Mary + Bill Sydney 44 input_4
If you just had one input at a time, simply moving the line for assigning V5 outside of your summarize and dplyr pipes would do it:
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
#V5 = print(deparse(substitute(input)))
)
data$V5 <- deparse(substitute(input))
return(data)
}
result(input_1, teams)
# V1.x V1.y V2 V3 V4 V5
# <chr> <chr> <chr> <chr> <dbl> <chr>
# 1 team1 Team_2022 Mary + Frank Sydney 131 input_1
# 2 team2 Team_2022 Mary + John Sydney 90 input_1
# 3 team3 Team_2022 Mary + Bill Sydney 76 input_1
But your use of lists makes that a bit challenging (it will output X[[i]] for that code).
To address this for using lapply, I would suggest simply adding an additional input that takes the name and simply assigning it that name, with additional tweaks to the lapply function to accommodate it:
result = function(input, teams, nme) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
#V5 = print(deparse(substitute(input)))
)
data$V5 <- nme
return(data)
}
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
# add in assigning names to the list
names(input_test) <- input_objects
output = lapply(input_objects, function(x) result(input_test[[x]], teams, nme = x)) %>%
bind_rows()
# V1.x V1.y V2 V3 V4 V5
# <chr> <chr> <chr> <chr> <dbl> <chr>
# 1 team1 Team_2022 Mary + Frank Sydney 131 input_1
# 2 team2 Team_2022 Mary + John Sydney 90 input_1
# 3 team3 Team_2022 Mary + Bill Sydney 76 input_1
# 4 team1 Team_2023 Mary + Frank Sydney 23 input_2
# 5 team2 Team_2023 Mary + John Sydney 33 input_2
# 6 team3 Team_2023 Mary + Bill Sydney 136 input_2
# 7 team1 Team_2024 Mary + Frank Sydney 26 input_3
# 8 team2 Team_2024 Mary + John Sydney 19 input_3
# 9 team3 Team_2024 Mary + Bill Sydney 71 input_3
# 10 team1 Team_2025 Mary + Frank Sydney 90 input_4
# 11 team2 Team_2025 Mary + John Sydney 132 input_4
# 12 team3 Team_2025 Mary + Bill Sydney 44 input_4
Related
For this week's tidytuesday challenge, for some reason, I am not able to group the column names in R which I was doing with pivot_longer function from tidyr previously. So, here is my code and I do not get it why it does throw an error and not give what I want.
library(tidyverse)
tuesdata <- tidytuesdayR::tt_load(2023, week = 7)
age_gaps <- tuesdata$age_gaps
df_long <- age_gaps %>%
pivot_longer(cols= actor_1_name:actor_2_name, names_to = "actornumber", values_to = "actorname") %>%
pivot_longer(cols= character_1_gender:character_2_gender, names_to = "gendernumber", values_to = "gender") %>%
pivot_longer(cols= actor_1_age:actor_2_age, names_to = "agenumber", values_to = "age") %>%
select(movie_name, release_year, director, age_difference, actorname, gender, age)
As seen from the code, the initial data has 1155 rows and after doing the quick data wrangling, I am expecting to get a data of 1155x2=2310 rows as I would like to merge the columns on actor names and their relevant information such as age and birthdate. Yet, the code does not give me the expected outcome and I am wondering why and how can I solve this problem. Thank you for your attention beforehand.
Example data (first 6 rows)
age_gaps <- structure(list(movie_name = c("Harold and Maude", "Venus", "The Quiet American",
"The Big Lebowski", "Beginners", "Poison Ivy"), release_year = c(1971,
2006, 2002, 1998, 2010, 1992), director = c("Hal Ashby", "Roger Michell",
"Phillip Noyce", "Joel Coen", "Mike Mills", "Katt Shea"), age_difference = c(52,
50, 49, 45, 43, 42), couple_number = c(1, 1, 1, 1, 1, 1), actor_1_name = c("Ruth Gordon",
"Peter O'Toole", "Michael Caine", "David Huddleston", "Christopher Plummer",
"Tom Skerritt"), actor_2_name = c("Bud Cort", "Jodie Whittaker",
"Do Thi Hai Yen", "Tara Reid", "Goran Visnjic", "Drew Barrymore"
), character_1_gender = c("woman", "man", "man", "man", "man",
"man"), character_2_gender = c("man", "woman", "woman", "woman",
"man", "woman"), actor_1_birthdate = structure(c(-26725, -13666,
-13442, -14351, -14629, -13278), class = "Date"), actor_2_birthdate = structure(c(-7948,
4536, 4656, 2137, 982, 1878), class = "Date"), actor_1_age = c(75,
74, 69, 68, 81, 59), actor_2_age = c(23, 24, 20, 23, 38, 17)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
You could set ".value" in names_to and supply one of names_sep or names_pattern to specify how the column names should be split.
library(tidyr)
age_gaps %>%
pivot_longer(actor_1_name:actor_2_age,
names_prefix = "(actor|character)_",
names_to = c("actor", ".value"),
names_sep = '_')
# A tibble: 12 × 10
movie_name release_year director age_difference couple_number actor name gender birthdate age
<chr> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr> <date> <dbl>
1 Harold and Maude 1971 Hal Ashby 52 1 1 Ruth Gordon woman 1896-10-30 75
2 Harold and Maude 1971 Hal Ashby 52 1 2 Bud Cort man 1948-03-29 23
3 Venus 2006 Roger Michell 50 1 1 Peter O'Toole man 1932-08-02 74
4 Venus 2006 Roger Michell 50 1 2 Jodie Whittaker woman 1982-06-03 24
5 The Quiet American 2002 Phillip Noyce 49 1 1 Michael Caine man 1933-03-14 69
6 The Quiet American 2002 Phillip Noyce 49 1 2 Do Thi Hai Yen woman 1982-10-01 20
7 The Big Lebowski 1998 Joel Coen 45 1 1 David Huddleston man 1930-09-17 68
8 The Big Lebowski 1998 Joel Coen 45 1 2 Tara Reid woman 1975-11-08 23
9 Beginners 2010 Mike Mills 43 1 1 Christopher Plummer man 1929-12-13 81
10 Beginners 2010 Mike Mills 43 1 2 Goran Visnjic man 1972-09-09 38
11 Poison Ivy 1992 Katt Shea 42 1 1 Tom Skerritt man 1933-08-25 59
12 Poison Ivy 1992 Katt Shea 42 1 2 Drew Barrymore woman 1975-02-22 17
I'm having trouble using the apply function and repeatedly get an error about different sources.
I believe that both sources are data.frames, so I can't work out why it doesn't like the apply inputs.
I would like to get to the output_desired without the need to run them individually. I appreciate I could likely do this with a loop, but am trying to learn some need tricks.
TLDR; I am trying to group the input_s into an object, and then run that dataframe through the function(result) and rbind the outputs.
library(dplyr)
library(tidyr)
## Inputs ##
input_1 = structure(list(V1 = c("Team_2022", "Team_2022", "Team_2022"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(55, 76, 14)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_2 = structure(list(V1 = c("Team_2023", "Team_2023", "Team_2023"), V2 = c("Bill", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(113, 23, 10)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_3 = structure(list(V1 = c("Team_2024", "Team_2024", "Team_2024"), V2 = c("Frank", "Mary", "Bill"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(7, 19, 52)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_4 = structure(list(V1 = c("Team_2025", "Team_2025", "Team_2025"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(46, 44, 88)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
## Teams ##
teams = structure(list(V1 = c("team1", "team2", "team3"), V2 = c("Mary + Frank","Mary + John", "Mary + Bill")), class = "data.frame", row.names = c(NA, -3L))
## Group the inputs into one ##
all_objects = ls()
input_objects = grep("^input", all_objects, value = T)
input_test = as.data.frame(input_obj)
## Function ##
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4))
return(data)
}
## Outputs individually ##
output_1 = result(input_1, teams)
output_2 = result(input_2, teams)
output_3 = result(input_3, teams)
output_4 = result(input_4, teams)
## Join outputs ##
output_desired = rbind(output_1, output_2, output_3, output_4)
## Failed apply ##
output_apply = apply(input_test, 1, function(x) {
result(x, teams)
}) %>% do.call("rbind", .)
Using lapply and dplyr::bind_rows you could do:
Note: I also fixed the creation of your input_test list for which I use lapply and get.
library(dplyr)
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
input_test %>%
lapply(result, teams) %>%
bind_rows()
#> # A tibble: 12 × 5
#> V1.x V1.y V2 V3 V4
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 team1 Team_2022 Mary + Frank Sydney 131
#> 2 team2 Team_2022 Mary + John Sydney 90
#> 3 team3 Team_2022 Mary + Bill Sydney 76
#> 4 team1 Team_2023 Mary + Frank Sydney 23
#> 5 team2 Team_2023 Mary + John Sydney 33
#> 6 team3 Team_2023 Mary + Bill Sydney 136
#> 7 team1 Team_2024 Mary + Frank Sydney 26
#> 8 team2 Team_2024 Mary + John Sydney 19
#> 9 team3 Team_2024 Mary + Bill Sydney 71
#> 10 team1 Team_2025 Mary + Frank Sydney 90
#> 11 team2 Team_2025 Mary + John Sydney 132
#> 12 team3 Team_2025 Mary + Bill Sydney 44
Or using purrr::map_df:
purrr::map_df(input_test, result, teams)
I'm relatively new to R and I have a dataframe that looks like this:
1
2
3
4
5
6
7
8
9
10
Name
Max
Max
Max
Joey
Joey
Nancy
Nancy
Nancy
Linda
Linda
Amount_Type
InternetBill
Groceries
WaterBill
InternetBill
Groceries
WaterBill
Groceries
InternetBill
WaterBill
Groceries
Amount
$75
$230.66
$40
$70
$188.75
$35
$175.89
$75
$30
$236.87
I need to add 3 more rows and pivot the dataframe:
The dataframe needs to be grouped by name and outputs 3 totals columns:
Fixed_Cost which should include InternetBill and WaterBill amounts
Variable_Cost which should include Groceries
Total_Cost which should be fixed + variable costs
So something like this:
Name
Fixed_Cost
Variable_Cost
Total_Cost
Max
$115
$230.66
$345.66
Joey
$70
$188.75
$258.75
Nancy
$110
$175.89
$285.89
Linda
$30
$236.87
$266.87
Any advice on how to go about doing this? Thanks!
If we transpose the data, it becomes more easier to do a group by sum
library(data.table)
data.table::transpose(setDT(df1), make.names = 1)[,
Amount := readr::parse_number(Amount)][,
.(Fixed_Cost = sum(Amount[Amount_Type %in% c("InternetBill", "WaterBill")]),
Variable_Cost = sum(Amount[!Amount_Type %in% c("InternetBill", "WaterBill")])),
by = Name][,
Total_Cost := Fixed_Cost + Variable_Cost][]
-output
Name Fixed_Cost Variable_Cost Total_Cost
<char> <num> <num> <num>
1: Max 115 230.66 345.66
2: Joey 70 188.75 258.75
3: Nancy 110 175.89 285.89
4: Linda 30 236.87 266.87
data
df1 <- structure(list(`0` = c("Name", "Amount_Type", "Amount"), `1` = c("Max",
"InternetBill", "$75"), `2` = c("Max", "Groceries", "$230.66"
), `3` = c("Max", "WaterBill", "$40"), `4` = c("Joey", "InternetBill",
"$70"), `5` = c("Joey", "Groceries", "$188.75"), `6` = c("Nancy",
"WaterBill", "$35"), `7` = c("Nancy", "Groceries", "$175.89"),
`8` = c("Nancy", "InternetBill", "$75"), `9` = c("Linda",
"WaterBill", "$30"), `10` = c("Linda", "Groceries", "$236.87"
)), class = "data.frame", row.names = c(NA, -3L))
library(tidyverse)
setNames(data.frame(t(df1[,-1])), df1[,1]) %>%
pivot_wider(Name, names_from = Amount_Type, values_from = Amount,
values_fn = parse_number, values_fill = 0) %>%
mutate(Fixed_cost = InternetBill + WaterBill, variable_cost = Groceries,
Total_Cost = Fixed_cost + variable_cost, .keep ='unused')
# A tibble: 4 x 4
Name Fixed_cost variable_cost Total_Cost
<chr> <dbl> <dbl> <dbl>
1 Max 115 231. 346.
2 Joey 70 189. 259.
3 Nancy 110 176. 286.
4 Linda 30 237. 267.
I'm trying to add specific row of a data frame together.
And short of using grepl to find lines and then rbinding them to the bottom, I'm not sure if there's a better way to do this.
this is my input df:
input = structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales","Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ", "NZ","NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L)), row.names = c(NA,6L),
class = "data.frame")
and this is my expected output:
structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales",
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred", "Johnny + Fred",
"Meg + Fred", "Johnny + Meg + Fred", "Johnny + Fred", "Meg + Fred",
"Johnny + Meg + Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ",
"NZ", "NZ", "Australia", "Australia", "Australia", "NZ", "NZ", "NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L, 329L, 2073L, 2227L, 1490L, 278L, 1513L)),
class = "data.frame", row.names = c(NA, -12L)
)
I would've thought there's a better way to there's a better way of adding these rows that filtering and then adding, and then joining etc.
Can anyone point me in the right direction of what I should be looking for?
I solve the problem using combn
Data input part
input = structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales","Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ", "NZ","NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L)), row.names = c(NA,6L),
class = "data.frame")
structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales",
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred", "Johnny + Fred",
"Meg + Fred", "Johnny + Meg + Fred", "Johnny + Fred", "Meg + Fred",
"Johnny + Meg + Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ",
"NZ", "NZ", "Australia", "Australia", "Australia", "NZ", "NZ", "NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L, 329L, 2073L, 2227L, 1490L, 278L, 1513L)),
class = "data.frame", row.names = c(NA, -12L)
)
Solution
library(dplyr)
TT = unique(input$V2)
> TT
[1] "Johnny" "Meg" "Fred"
comb2 = combn(TT,2,simplify = FALSE)
> comb2
[[1]]
[1] "Johnny" "Meg"
[[2]]
[1] "Johnny" "Fred"
[[3]]
[1] "Meg" "Fred"
comb3 = combn(TT,3,simplify = FALSE)
> comb3
[[1]]
[1] "Johnny" "Meg" "Fred"
result = function(data){
purrr::map_df(lapply(data,function(x){paste(x,collapse = '|')}), function(x){
df = input[grepl(x,input$V2),] %>% group_by(V3)%>%summarize(V1= 'Sales',
V2= paste(V2,collapse = '+'),
V4= sum(V4))
return(df)
}
)
}
Result
result(comb2)
# A tibble: 6 x 4
V3 V1 V2 V4
<chr> <chr> <chr> <int>
1 Australia Sales Johnny+Meg 2052
2 NZ Sales Johnny+Meg 1258
3 Australia Sales Johnny+Fred 329
4 NZ Sales Johnny+Fred 1490
5 Australia Sales Meg+Fred 2073
6 NZ Sales Meg+Fred 278
result(comb3)
# A tibble: 2 x 4
V3 V1 V2 V4
<chr> <chr> <chr> <int>
1 Australia Sales Johnny+Meg+Fred 2227
2 NZ Sales Johnny+Meg+Fred 1513
finalResult = bind_rows(A,B,input) %>%
select(V1,V2,V3,V4) %>% filter(! V2 %in% c('Johnny+Meg'))
> finalResult
# A tibble: 12 x 4
V1 V2 V3 V4
<chr> <chr> <chr> <int>
1 Sales Johnny+Fred Australia 329
2 Sales Johnny+Fred NZ 1490
3 Sales Meg+Fred Australia 2073
4 Sales Meg+Fred NZ 278
5 Sales Johnny+Meg+Fred Australia 2227
6 Sales Johnny+Meg+Fred NZ 1513
7 Sales Johnny Australia 154
8 Sales Meg Australia 1898
9 Sales Fred Australia 175
10 Sales Johnny NZ 1235
11 Sales Meg NZ 23
12 Sales Fred NZ 255
Using tidyverse we can first split the dataframe based on V3 then create combination of names and add sum to create a new tibble and bind it to the original dataframe.
library(tidyverse)
input %>%
bind_rows(input %>%
group_split(V3) %>%
map_dfr(function(x) map_dfr(2:nrow(x), ~tibble(
V1 = first(x$V1),
V2 = combn(x$V2, ., paste, collapse = " + "),
V3 = first(x$V3),
V4 = combn(x$V4, .,sum)) %>%
filter(grepl("\\bFred\\b", V2)))))
# V1 V2 V3 V4
#1 Sales Johnny Australia 154
#2 Sales Meg Australia 1898
#3 Sales Fred Australia 175
#4 Sales Johnny NZ 1235
#5 Sales Meg NZ 23
#6 Sales Fred NZ 255
#7 Sales Johnny + Fred Australia 329
#8 Sales Meg + Fred Australia 2073
#9 Sales Johnny + Meg + Fred Australia 2227
#10 Sales Johnny + Fred NZ 1490
#11 Sales Meg + Fred NZ 278
#12 Sales Johnny + Meg + Fred NZ 1513
Using the same logic but in base R, we can do
rbind(input, do.call(rbind, lapply(split(input, input$V3), function(x)
do.call(rbind, lapply(2:nrow(x), function(y)
subset(data.frame(V1 = x$V1[1],
V2 = combn(x$V2, y, paste, collapse = " + "),
V3 = x$V3[1],
V4 = combn(x$V4, y, sum)),
grepl("\\bFred\\b", V2)))))))
I am trying to aggregate rows in dataframe that have some values similar and others different as below :
dataframe1 <- data.frame(Company_Name = c("KFC", "KFC", "KFC", "McD", "McD"),
Company_ID = c(1, 1, 1, 2, 2),
Company_Phone = c("237389", "-", "-", "237002", "-"),
Employee_Name = c("John", "Mary", "Jane", "Joshua",
"Anne"),
Employee_ID = c(1001, 1002, 1003, 2001, 2002))
I wish to combine the rows for the values that are similar and creating new columns for the values that are different as below:
dataframe2 <- data.frame(Company_Name = c("KFC", "McD"),
Company_ID = c(1, 2),
Company_Phone = c("237389", "237002"),
Employee_Name1 = c("John", "Joshua" ),
Employee_ID1 = c(1001, 2001),
Employee_Name2 = c("Mary", "Anne"),
Employee_ID2 = c(1002, 2002),
Employee_Name3 = c("Jane", "na"),
Employee_ID3 = c(1003, "na"))
I have checked similar questions such as this Combining duplicated rows in R and adding new column containing IDs of duplicates and R: collapse rows and then convert row into a new column but I do not wish to sepoarate the values by commas but rather create new columns.
# Company_Name Company_ID Company_Phone Employee_Name1 Employee_ID1 Employee_Name2 Employee_ID2 Employee_Name3 Employee_ID3
#1 KFC 1 237389 John 1001 Mary 1002 Jane 1003
#2 McD 2 237002 Joshua 2001 Anne 2002 na na
Thank you in advance.
A solution using tidyverse. dat is the final output.
library(tidyverse)
dat <- dataframe1 %>%
mutate_if(is.factor, as.character) %>%
mutate(Company_Phone = ifelse(Company_Phone %in% "-", NA, Company_Phone)) %>%
fill(Company_Phone) %>%
group_by(Company_ID) %>%
mutate(ID = 1:n()) %>%
gather(Info, Value, starts_with("Employee_")) %>%
unite(New_Col, Info, ID, sep = "") %>%
spread(New_Col, Value) %>%
select(c("Company_Name", "Company_ID", "Company_Phone",
paste0(rep(c("Employee_ID", "Employee_Name"), 3), rep(1:3, each = 2)))) %>%
ungroup()
# View the result
dat %>% as.data.frame(stringsAsFactors = FALSE)
# Company_Name Company_ID Company_Phone Employee_ID1 Employee_Name1 Employee_ID2 Employee_Name2 Employee_ID3 Employee_Name3
# 1 KFC 1 237389 1001 John 1002 Mary 1003 Jane
# 2 McD 2 237002 2001 Joshua 2002 Anne <NA> <NA>
We could do this with dcast from data.table which can take multiple value.var columns. Convert the 'data.frame' to 'data.table' (setDT(dataframe1)), grouped by 'Company_Name', replace the 'Company_Phone' _ elements with the first alphanumeric string, then dcast from 'long' to 'wide' by specifying 'Employee_Name' and 'Employee_ID' as the value.var columns
library(data.table)
setDT(dataframe1)[, Company_Phone := first(Company_Phone), Company_Name]
res <- dcast(dataframe1, Company_Name + Company_ID + Company_Phone ~
rowid(Company_Name), value.var = c("Employee_Name", "Employee_ID"), sep='')
-output
res
#Company_Name Company_ID Company_Phone Employee_Name1 Employee_Name2 Employee_Name3 Employee_ID1 Employee_ID2 Employee_ID3
#1: KFC 1 237389 John Mary Jane 1001 1002 1003
#2: McD 2 237002 Joshua Anne NA 2001 2002 NA
If we need to order it
res[, c(1:3, order(as.numeric(sub("\\D+", "", names(res)[-(1:3)]))) + 3), with = FALSE]
# Company_Name Company_ID Company_Phone Employee_Name1 Employee_ID1 Employee_Name2 Employee_ID2 Employee_Name3 Employee_ID3
#1: KFC 1 237389 John 1001 Mary 1002 Jane 1003
#2: McD 2 237002 Joshua 2001 Anne 2002 NA NA
Here is an other approach combining dplyr and cSplit
library(dplyr)
dataframe1 <- dataframe1 %>%
group_by(Company_Name, Company_ID) %>%
summarise_all(funs(paste((.), collapse = ",")))
library(splitstackshape)
dataframe1 <- cSplit(dataframe1, c("Company_Phone", "Employee_Name", "Employee_ID"), ",")
dataframe1
# Company_Name Company_ID Company_Phone_1 Company_Phone_2 Company_Phone_3 Employee_Name_1 Employee_Name_2 Employee_Name_3 Employee_ID_1 Employee_ID_2 Employee_ID_3
#1: KFC 1 237389 - - John Mary Jane 1001 1002 1003
#2: McD 2 237002 - NA Joshua Anne NA 2001 2002 NA