I'm trying to add specific row of a data frame together.
And short of using grepl to find lines and then rbinding them to the bottom, I'm not sure if there's a better way to do this.
this is my input df:
input = structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales","Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ", "NZ","NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L)), row.names = c(NA,6L),
class = "data.frame")
and this is my expected output:
structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales",
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred", "Johnny + Fred",
"Meg + Fred", "Johnny + Meg + Fred", "Johnny + Fred", "Meg + Fred",
"Johnny + Meg + Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ",
"NZ", "NZ", "Australia", "Australia", "Australia", "NZ", "NZ", "NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L, 329L, 2073L, 2227L, 1490L, 278L, 1513L)),
class = "data.frame", row.names = c(NA, -12L)
)
I would've thought there's a better way to there's a better way of adding these rows that filtering and then adding, and then joining etc.
Can anyone point me in the right direction of what I should be looking for?
I solve the problem using combn
Data input part
input = structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales","Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ", "NZ","NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L)), row.names = c(NA,6L),
class = "data.frame")
structure(list(
V1 = c("Sales", "Sales", "Sales", "Sales", "Sales",
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales"),
V2 = c("Johnny", "Meg", "Fred", "Johnny", "Meg", "Fred", "Johnny + Fred",
"Meg + Fred", "Johnny + Meg + Fred", "Johnny + Fred", "Meg + Fred",
"Johnny + Meg + Fred"),
V3 = c("Australia", "Australia", "Australia", "NZ",
"NZ", "NZ", "Australia", "Australia", "Australia", "NZ", "NZ", "NZ"),
V4 = c(154L, 1898L, 175L, 1235L, 23L, 255L, 329L, 2073L, 2227L, 1490L, 278L, 1513L)),
class = "data.frame", row.names = c(NA, -12L)
)
Solution
library(dplyr)
TT = unique(input$V2)
> TT
[1] "Johnny" "Meg" "Fred"
comb2 = combn(TT,2,simplify = FALSE)
> comb2
[[1]]
[1] "Johnny" "Meg"
[[2]]
[1] "Johnny" "Fred"
[[3]]
[1] "Meg" "Fred"
comb3 = combn(TT,3,simplify = FALSE)
> comb3
[[1]]
[1] "Johnny" "Meg" "Fred"
result = function(data){
purrr::map_df(lapply(data,function(x){paste(x,collapse = '|')}), function(x){
df = input[grepl(x,input$V2),] %>% group_by(V3)%>%summarize(V1= 'Sales',
V2= paste(V2,collapse = '+'),
V4= sum(V4))
return(df)
}
)
}
Result
result(comb2)
# A tibble: 6 x 4
V3 V1 V2 V4
<chr> <chr> <chr> <int>
1 Australia Sales Johnny+Meg 2052
2 NZ Sales Johnny+Meg 1258
3 Australia Sales Johnny+Fred 329
4 NZ Sales Johnny+Fred 1490
5 Australia Sales Meg+Fred 2073
6 NZ Sales Meg+Fred 278
result(comb3)
# A tibble: 2 x 4
V3 V1 V2 V4
<chr> <chr> <chr> <int>
1 Australia Sales Johnny+Meg+Fred 2227
2 NZ Sales Johnny+Meg+Fred 1513
finalResult = bind_rows(A,B,input) %>%
select(V1,V2,V3,V4) %>% filter(! V2 %in% c('Johnny+Meg'))
> finalResult
# A tibble: 12 x 4
V1 V2 V3 V4
<chr> <chr> <chr> <int>
1 Sales Johnny+Fred Australia 329
2 Sales Johnny+Fred NZ 1490
3 Sales Meg+Fred Australia 2073
4 Sales Meg+Fred NZ 278
5 Sales Johnny+Meg+Fred Australia 2227
6 Sales Johnny+Meg+Fred NZ 1513
7 Sales Johnny Australia 154
8 Sales Meg Australia 1898
9 Sales Fred Australia 175
10 Sales Johnny NZ 1235
11 Sales Meg NZ 23
12 Sales Fred NZ 255
Using tidyverse we can first split the dataframe based on V3 then create combination of names and add sum to create a new tibble and bind it to the original dataframe.
library(tidyverse)
input %>%
bind_rows(input %>%
group_split(V3) %>%
map_dfr(function(x) map_dfr(2:nrow(x), ~tibble(
V1 = first(x$V1),
V2 = combn(x$V2, ., paste, collapse = " + "),
V3 = first(x$V3),
V4 = combn(x$V4, .,sum)) %>%
filter(grepl("\\bFred\\b", V2)))))
# V1 V2 V3 V4
#1 Sales Johnny Australia 154
#2 Sales Meg Australia 1898
#3 Sales Fred Australia 175
#4 Sales Johnny NZ 1235
#5 Sales Meg NZ 23
#6 Sales Fred NZ 255
#7 Sales Johnny + Fred Australia 329
#8 Sales Meg + Fred Australia 2073
#9 Sales Johnny + Meg + Fred Australia 2227
#10 Sales Johnny + Fred NZ 1490
#11 Sales Meg + Fred NZ 278
#12 Sales Johnny + Meg + Fred NZ 1513
Using the same logic but in base R, we can do
rbind(input, do.call(rbind, lapply(split(input, input$V3), function(x)
do.call(rbind, lapply(2:nrow(x), function(y)
subset(data.frame(V1 = x$V1[1],
V2 = combn(x$V2, y, paste, collapse = " + "),
V3 = x$V3[1],
V4 = combn(x$V4, y, sum)),
grepl("\\bFred\\b", V2)))))))
Related
I am trying to add the data input variable name as an output value in a separate column ($V5 within my function "result).
The inputs are different data values, and would like to save these names in the outputs so I can track where the data is coming from.
I thought this would be fairly straightforward, and that I could use print(deparse(substitute(input))) but this doesn't work.
Can anyone recommend a solution?
library(dplyr)
library(tidyr)
## Inputs ##
input_1 = structure(list(V1 = c("Team_2022", "Team_2022", "Team_2022"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(55, 76, 14)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_2 = structure(list(V1 = c("Team_2023", "Team_2023", "Team_2023"), V2 = c("Bill", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(113, 23, 10)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_3 = structure(list(V1 = c("Team_2024", "Team_2024", "Team_2024"), V2 = c("Frank", "Mary", "Bill"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(7, 19, 52)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_4 = structure(list(V1 = c("Team_2025", "Team_2025", "Team_2025"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(46, 44, 88)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
## Teams ##
teams = structure(list(V1 = c("team1", "team2", "team3"), V2 = c("Mary + Frank","Mary + John", "Mary + Bill")), class = "data.frame", row.names = c(NA, -3L))
## Group the inputs into one ##
all_objects = ls()
input_objects = grep("^input", all_objects, value = T)
input_test = as.data.frame(input_obj)
## Function ##
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
V5 = print(deparse(substitute(input))))
return(data)
}
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
output = input_test %>%
lapply(result, teams) %>%
bind_rows()
### Current output ###
structure(list(V1.x = c("team1", "team2", "team3", "team1", "team2",
"team3", "team1", "team2", "team3", "team1", "team2", "team3"
), V1.y = c("Team_2022", "Team_2022", "Team_2022", "Team_2023",
"Team_2023", "Team_2023", "Team_2024", "Team_2024", "Team_2024",
"Team_2025", "Team_2025", "Team_2025"), V2 = c("Mary + Frank",
"Mary + John", "Mary + Bill", "Mary + Frank", "Mary + John",
"Mary + Bill", "Mary + Frank", "Mary + John", "Mary + Bill",
"Mary + Frank", "Mary + John", "Mary + Bill"), V3 = c("Sydney",
"Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney",
"Sydney", "Sydney", "Sydney", "Sydney"), V4 = c(131, 90, 76,
23, 33, 136, 26, 19, 71, 90, 132, 44), V5 = c("input", "input",
"input", "input", "input", "input", "input", "input", "input",
"input", "input", "input")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -12L))
### Desired Output ###
structure(list(V1.x = c("team1", "team2", "team3", "team1", "team2",
"team3", "team1", "team2", "team3", "team1", "team2", "team3"
), V1.y = c("Team_2022", "Team_2022", "Team_2022", "Team_2023",
"Team_2023", "Team_2023", "Team_2024", "Team_2024", "Team_2024",
"Team_2025", "Team_2025", "Team_2025"), V2 = c("Mary + Frank",
"Mary + John", "Mary + Bill", "Mary + Frank", "Mary + John",
"Mary + Bill", "Mary + Frank", "Mary + John", "Mary + Bill",
"Mary + Frank", "Mary + John", "Mary + Bill"), V3 = c("Sydney",
"Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney", "Sydney",
"Sydney", "Sydney", "Sydney", "Sydney"), V4 = c(131, 90, 76,
23, 33, 136, 26, 19, 71, 90, 132, 44), V5 = c("input_1", "input_1",
"input_1", "input_2", "input_2", "input_2", "input_3", "input_3", "input_3",
"input_4", "input_4", "input_4")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -12L))
If I understand you correctly: the purrr::map_df() function has a nice feature to identify inputs lists (ideally named) in final data.frames:
library(dplyr)
library(tidyr)
# dropping V5 as it will be "automatically" computed
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>%
fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4)
# we do not need V5 here anymore
)
return(data)
}
l_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
# name the object list to use the name for identification
names(input_test) <- input_objects
# use purrr map to data.frame with the .id feature
purrr::map_df(input_test, ~result(.x, teams), .id = "V5")
V5 V1.x V1.y V2 V3 V4
<chr> <chr> <chr> <chr> <chr> <dbl>
1 input_1 team1 Team_2022 Mary + Frank Sydney 131
2 input_1 team2 Team_2022 Mary + John Sydney 90
3 input_1 team3 Team_2022 Mary + Bill Sydney 76
4 input_2 team1 Team_2023 Mary + Frank Sydney 23
5 input_2 team2 Team_2023 Mary + John Sydney 33
6 input_2 team3 Team_2023 Mary + Bill Sydney 136
7 input_3 team1 Team_2024 Mary + Frank Sydney 26
8 input_3 team2 Team_2024 Mary + John Sydney 19
9 input_3 team3 Team_2024 Mary + Bill Sydney 71
10 input_4 team1 Team_2025 Mary + Frank Sydney 90
11 input_4 team2 Team_2025 Mary + John Sydney 132
12 input_4 team3 Team_2025 Mary + Bill Sydney 44
Note that this works without naming the list aswell, though you will only get the list item number, which might be insufficient.
Also the plyr::ldply function can be used to bind named lists, generating a new column with the list names in the result data.frame.
Instead of doing the deparse/substitute, create an argument in result for the names as well, and then use that
library(dplyr)
library(purrr)
library(tidyr)
result <- function(input, teams, inputnm) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
V5 = inputnm)
return(data)
}
-testing
input_test %>%
pull(input_objects) %>%
mget(inherits = TRUE) %>%
imap_dfr(~ result(.x, teams, .y))
-output
# A tibble: 12 × 6
V1.x V1.y V2 V3 V4 V5
<chr> <chr> <chr> <chr> <dbl> <chr>
1 team1 Team_2022 Mary + Frank Sydney 131 input_1
2 team2 Team_2022 Mary + John Sydney 90 input_1
3 team3 Team_2022 Mary + Bill Sydney 76 input_1
4 team1 Team_2023 Mary + Frank Sydney 23 input_2
5 team2 Team_2023 Mary + John Sydney 33 input_2
6 team3 Team_2023 Mary + Bill Sydney 136 input_2
7 team1 Team_2024 Mary + Frank Sydney 26 input_3
8 team2 Team_2024 Mary + John Sydney 19 input_3
9 team3 Team_2024 Mary + Bill Sydney 71 input_3
10 team1 Team_2025 Mary + Frank Sydney 90 input_4
11 team2 Team_2025 Mary + John Sydney 132 input_4
12 team3 Team_2025 Mary + Bill Sydney 44 input_4
If you just had one input at a time, simply moving the line for assigning V5 outside of your summarize and dplyr pipes would do it:
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
#V5 = print(deparse(substitute(input)))
)
data$V5 <- deparse(substitute(input))
return(data)
}
result(input_1, teams)
# V1.x V1.y V2 V3 V4 V5
# <chr> <chr> <chr> <chr> <dbl> <chr>
# 1 team1 Team_2022 Mary + Frank Sydney 131 input_1
# 2 team2 Team_2022 Mary + John Sydney 90 input_1
# 3 team3 Team_2022 Mary + Bill Sydney 76 input_1
But your use of lists makes that a bit challenging (it will output X[[i]] for that code).
To address this for using lapply, I would suggest simply adding an additional input that takes the name and simply assigning it that name, with additional tweaks to the lapply function to accommodate it:
result = function(input, teams, nme) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4),
#V5 = print(deparse(substitute(input)))
)
data$V5 <- nme
return(data)
}
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
# add in assigning names to the list
names(input_test) <- input_objects
output = lapply(input_objects, function(x) result(input_test[[x]], teams, nme = x)) %>%
bind_rows()
# V1.x V1.y V2 V3 V4 V5
# <chr> <chr> <chr> <chr> <dbl> <chr>
# 1 team1 Team_2022 Mary + Frank Sydney 131 input_1
# 2 team2 Team_2022 Mary + John Sydney 90 input_1
# 3 team3 Team_2022 Mary + Bill Sydney 76 input_1
# 4 team1 Team_2023 Mary + Frank Sydney 23 input_2
# 5 team2 Team_2023 Mary + John Sydney 33 input_2
# 6 team3 Team_2023 Mary + Bill Sydney 136 input_2
# 7 team1 Team_2024 Mary + Frank Sydney 26 input_3
# 8 team2 Team_2024 Mary + John Sydney 19 input_3
# 9 team3 Team_2024 Mary + Bill Sydney 71 input_3
# 10 team1 Team_2025 Mary + Frank Sydney 90 input_4
# 11 team2 Team_2025 Mary + John Sydney 132 input_4
# 12 team3 Team_2025 Mary + Bill Sydney 44 input_4
I'm having trouble using the apply function and repeatedly get an error about different sources.
I believe that both sources are data.frames, so I can't work out why it doesn't like the apply inputs.
I would like to get to the output_desired without the need to run them individually. I appreciate I could likely do this with a loop, but am trying to learn some need tricks.
TLDR; I am trying to group the input_s into an object, and then run that dataframe through the function(result) and rbind the outputs.
library(dplyr)
library(tidyr)
## Inputs ##
input_1 = structure(list(V1 = c("Team_2022", "Team_2022", "Team_2022"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(55, 76, 14)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_2 = structure(list(V1 = c("Team_2023", "Team_2023", "Team_2023"), V2 = c("Bill", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(113, 23, 10)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_3 = structure(list(V1 = c("Team_2024", "Team_2024", "Team_2024"), V2 = c("Frank", "Mary", "Bill"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(7, 19, 52)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
input_4 = structure(list(V1 = c("Team_2025", "Team_2025", "Team_2025"), V2 = c("Frank", "Mary", "John"), V3 = c("Sydney", "Sydney", "Sydney"), V4 = c(46, 44, 88)), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
## Teams ##
teams = structure(list(V1 = c("team1", "team2", "team3"), V2 = c("Mary + Frank","Mary + John", "Mary + Bill")), class = "data.frame", row.names = c(NA, -3L))
## Group the inputs into one ##
all_objects = ls()
input_objects = grep("^input", all_objects, value = T)
input_test = as.data.frame(input_obj)
## Function ##
result = function(input, teams) {
data = teams %>%
separate_rows(V2) %>%
left_join(input, by = c("V2" = "V2")) %>%
replace_na(list(V4 = 0)) %>%
group_by(V1.x) %>% fill(V1.y, V3) %>%
summarize(V1.y = first(V1.y),
V2 = paste(V2, collapse = " + "),
V3 = first(V3),
V4 = sum(V4))
return(data)
}
## Outputs individually ##
output_1 = result(input_1, teams)
output_2 = result(input_2, teams)
output_3 = result(input_3, teams)
output_4 = result(input_4, teams)
## Join outputs ##
output_desired = rbind(output_1, output_2, output_3, output_4)
## Failed apply ##
output_apply = apply(input_test, 1, function(x) {
result(x, teams)
}) %>% do.call("rbind", .)
Using lapply and dplyr::bind_rows you could do:
Note: I also fixed the creation of your input_test list for which I use lapply and get.
library(dplyr)
all_objects <- ls()
input_objects <- grep("^input_\\d", all_objects, value = T)
input_test <- lapply(input_objects, get)
input_test %>%
lapply(result, teams) %>%
bind_rows()
#> # A tibble: 12 × 5
#> V1.x V1.y V2 V3 V4
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 team1 Team_2022 Mary + Frank Sydney 131
#> 2 team2 Team_2022 Mary + John Sydney 90
#> 3 team3 Team_2022 Mary + Bill Sydney 76
#> 4 team1 Team_2023 Mary + Frank Sydney 23
#> 5 team2 Team_2023 Mary + John Sydney 33
#> 6 team3 Team_2023 Mary + Bill Sydney 136
#> 7 team1 Team_2024 Mary + Frank Sydney 26
#> 8 team2 Team_2024 Mary + John Sydney 19
#> 9 team3 Team_2024 Mary + Bill Sydney 71
#> 10 team1 Team_2025 Mary + Frank Sydney 90
#> 11 team2 Team_2025 Mary + John Sydney 132
#> 12 team3 Team_2025 Mary + Bill Sydney 44
Or using purrr::map_df:
purrr::map_df(input_test, result, teams)
I have a large DF with certain columns that have a vector of character values as below. The number of columns varies from dataset to dataset as well as the number of character vectors it holds also varies.
ID Country1 Country2 Country3
1 1 Argentina, Japan,USA,Poland, Argentina,USA Pakistan
2 2 Colombia, Mexico,Uruguay,Dutch Mexico,Uruguay Afganisthan
3 3 Argentina, Japan,USA,NA Japan Khazagistan
4 4 Colombia, Mexico,Uruguay,Dutch Colombia, Dutch North Korea
5 5 India, China China Iran
Would like to match them one-to-one with another string vector as below
vals_to_find <-c("Argentina","USA","Mexico")
If, a column/row matches to anyone of the strings passed would like to retain that column and row. Remove duplicates, and finally remove those values that do not match.
the desired output is as follows
ID Countries.found
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
data
dput(df)
structure(list(ID = 1:5, Country1 = c("Argentina, Japan,USA,Poland,",
"Colombia, Mexico,Uruguay,Dutch", "Argentina, Japan,USA,NA",
"Colombia, Mexico,Uruguay,Dutch", "India, China"), Country2 = c("Argentina,USA",
"Mexico,Uruguay", "Japan", "Colombia, Dutch", "China"), Country3 = c("Pakistan",
"Afganisthan", "Khazagistan", "North Korea", "Iran")), class = "data.frame", row.names = c(NA,
-5L))
dput(df_out)
structure(list(ID = 1:4, Countries.found = c("Argentina, USA",
"Mexico", "Argentina, USA", "Mexico")), class = "data.frame", row.names = c(NA,
-4L))
Instead of a each column as a vector, if the file is read as one value per column. Then, was able do it as below
dput(df_out)
structure(list(ID = 1:5, X1 = c("Argentina", "Colombia", "Argentina",
"Colombia", "India"), X2 = c("Japan", "Mexico", "Japan", "Mexico",
"China"), X3 = c("USA", "Uruguay", "USA", "Uruguay", NA), X4 = c("Poland",
"Dutch", NA, "Dutch", NA), X5 = c("Argentina", "Mexico", "Japan",
"Colombia", "China"), X6 = c("USA", "Uruguay", NA, "Dutch", NA
), X7 = c("Pakistan", "Afganisthan", "Khazagistan", "North Korea",
"Iran")), class = "data.frame", row.names = c(NA, -5L))
df_out %>%
dplyr::select(
where(~ !all(is.na(.x)))
) %>%
dplyr::select(c(1, where(~ any(.x %in% vals_to_find)))) %>%
dplyr::mutate(dplyr::across(
tidyselect::starts_with("X"),
~ vals_to_find[match(., vals_to_find)]
)) %>%
tidyr::unite("countries_found", tidyselect::starts_with("X"),
sep = " | ", remove = TRUE, na.rm = TRUE
)
Output
ID countries_found
1 1 Argentina | USA | Argentina | USA
2 2 Mexico | Mexico
3 3 Argentina | USA
4 4 Mexico
unite the "Country" columns, then create a long vector by separating the values into rows, get all distinct values per ID, filter only those who are in vals_to_find, and summarise each countries.found toString.
library(tidyr)
library(dplyr)
df %>%
unite("Country", starts_with("Country"), sep = ",") %>%
separate_rows(Country) %>%
distinct(ID, Country) %>%
filter(Country %in% vals_to_find) %>%
group_by(ID) %>%
summarise(Countries.found = toString(Country))
output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
We may use
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(across(starts_with("Country"),
~ str_extract_all(.x, str_c(vals_to_find, collapse = "|")))) %>%
pivot_longer(cols = -ID, names_to = NULL,
values_to = 'Countries.found') %>%
unnest(Countries.found) %>%
distinct %>%
group_by(ID) %>%
summarise(Countries.found = toString(Countries.found))
-output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
I have data that is structure like such:
actor_data <- structure(list(id = c(123L, 456L, 789L, 912L, 235L), name = c("Tom Cruise",
"Will Smith", "Ryan Reynolds", "Chris Rock", "Emma Stone"), locationid1 = c(5459L,
NA, 6114L, NA, NA), location1 = c("Paris, France", "", "Brooklyn, NY",
"", ""), locationid2 = c(NA, 5778L, NA, NA, 4432L), location3 = c("",
"Dolby Theater", "", "", "Hollywood"), locationid3 = c(NA, 2526L,
3101L, NA, NA), location3.1 = c("", "London", "Boston", "", ""
), locationid4 = c(6667L, 2333L, 1118L, NA, NA), location4 = c("Virginia",
"Maryland", "Washington", "", "")), class = "data.frame", row.names = c(NA,
-5L))
I am trying to make the location data run vertically instead of horizontally while also making sure its not accounting for blank fields.
So the final result will look like this:
actor_data_exp <- structure(list(id = c(123L, 123L, 456L, 456L, 456L, 789L, 789L,
789L, 235L), name = c("Tom Cruise", "Tom Cruise", "Will Smith",
"Will Smith", "Will Smith", "Ryan Reynolds", "Ryan Reynolds",
"Ryan Reynolds", "Emma Stone"), locationid = c(5459L, 6667L,
5778L, 2526L, 2333L, 6114L, 3101L, 1118L, 4432L), location = c("Paris, France",
"Virginia", "Dolby Theater", "London", "Maryland", "Brooklyn, NY",
"Boston", "Washington", "Hollywood")), class = "data.frame", row.names = c(NA,
-9L))
Or to give you a visual, will end up looking like this:
I would rename the columns that start with "location", so that there is an underscore before the number in each column name. Then use pivot_longer with the underscore as name_sep, and using c(".value", "var") in the names_to argument that to ensure both location and locationid have their own columns. This will also create the redundant column var which will contain the numbers 1-4 that were appended to the original column names.
Finally, filter out missing values and remove the redundant var column.
library(tidyverse)
actor_data %>%
rename_with(~ ifelse(grepl("location", .x),
sub("^(.*?)([0-9]\\.?\\d?)$", "\\1_\\2", .x), .x)) %>%
pivot_longer(starts_with("location"),
names_sep = "_", names_to = c(".value", "var")) %>%
filter(!is.na(locationid) & !is.na(location) & nzchar(location)) %>%
select(-var)
#> # A tibble: 6 x 4
#> id name locationid location
#> <int> <chr> <int> <chr>
#> 1 123 Tom Cruise 5459 Paris, France
#> 2 123 Tom Cruise 6667 Virginia
#> 3 456 Will Smith 2526 Dolby Theater
#> 4 456 Will Smith 2333 Maryland
#> 5 789 Ryan Reynolds 6114 Brooklyn, NY
#> 6 789 Ryan Reynolds 1118 Washington
I currently need to translate my dplyr code into base R code. My dplyr code gives me 3 columns, competitor sex, the olympic season and the number of different sports. The code looks like this:
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
My data structure looks like this.
structure(list(Name = c("A Lamusi", "Juhamatti Tapio Aaltonen",
"Andreea Aanei", "Jamale (Djamel-) Aarrass (Ahrass-)", "Nstor Abad Sanjun",
"Nstor Abad Sanjun"), Sex = c("M", "M", "F", "M", "M", "M"),
Age = c(23L, 28L, 22L, 30L, 23L, 23L), Height = c(170L, 184L,
170L, 187L, 167L, 167L), Weight = c(60, 85, 125, 76, 64,
64), Team = c("China", "Finland", "Romania", "France", "Spain",
"Spain"), NOC = c("CHN", "FIN", "ROU", "FRA", "ESP", "ESP"
), Games = c("2012 Summer", "2014 Winter", "2016 Summer",
"2012 Summer", "2016 Summer", "2016 Summer"), Year = c(2012L,
2014L, 2016L, 2012L, 2016L, 2016L), Season = c("Summer",
"Winter", "Summer", "Summer", "Summer", "Summer"), City = c("London",
"Sochi", "Rio de Janeiro", "London", "Rio de Janeiro", "Rio de Janeiro"
), Sport = c("Judo", "Ice Hockey", "Weightlifting", "Athletics",
"Gymnastics", "Gymnastics"), Event = c("Judo Men's Extra-Lightweight",
"Ice Hockey Men's Ice Hockey", "Weightlifting Women's Super-Heavyweight",
"Athletics Men's 1,500 metres", "Gymnastics Men's Individual All-Around",
"Gymnastics Men's Floor Exercise"), Medal = c(NA, "Bronze",
NA, NA, NA, NA), BMI = c(20.7612456747405, 25.1063327032136,
43.2525951557093, 21.7335354170837, 22.9481157445588, 22.9481157445588
)), .Names = c("Name", "Sex", "Age", "Height", "Weight",
"Team", "NOC", "Games", "Year", "Season", "City", "Sport", "Event",
"Medal", "BMI"), row.names = c(NA, 6L), class = "data.frame")
Does anyone know how to translate this into base R?
Since you are grouping twice in dplyr you can use double aggregate in base R
setNames(aggregate(Name~Sex + Season,
aggregate(Name~Sex + Season + Sport, olympics, length), length),
c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# Competitor_Sex Olympic_Season Num_Sports
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
This gives the same output as dplyr option
library(dplyr)
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# Competitor_Sex Olympic_Season Num_Sports
# <chr> <chr> <int>
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
A base R option would be using aggregate twice
out <- aggregate(BMI ~ Sex + Season,
aggregate(BMI ~ Sex + Season + Sport, olympics, length), length)
names(out) <- c("Competitor_Sex", "Olympic_Season", "Num_Sports")
out
# Competitor_Sex Olympic_Season Num_Sports
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
It is similar to the OP's output
olympics %>%
group_by(Sex, Season, Sport) %>%
summarise(n()) %>%
group_by(Sex, Season) %>%
summarise(n()) %>%
setNames(c("Competitor_Sex", "Olympic_Season", "Num_Sports"))
# A tibble: 3 x 3
# Groups: Sex [2]
# Competitor_Sex Olympic_Season Num_Sports
# <chr> <chr> <int>
#1 F Summer 1
#2 M Summer 3
#3 M Winter 1
Or it can be done in a compact way with table from base R
table(sub(",[^,]+$", "", names(table(do.call(paste,
c(olympics[c("Sex", "Season", "Sport")], sep=","))))))
# F,Summer M,Summer M,Winter
# 1 3 1