I have the following dfs
df1 <- data.frame(name= c("mark", "peter", "lily"), col1= c(1,2,3),col2= c(4,5,6))
df2 <- data.frame(name= c("mark", "liam", "peter"), col1= c(7,8,9),col2= c(1,2,3))
df3 <- data.frame(name= c("felix", "liam", "peter"), col1= c(3,5,8),col2= c(1,5,8))
df4 <- data.frame(name= c("felix", "lily", "liam"), col1= c(6,2,6),col2= c(4,2,2))
df_list <- list(df1,df2,df3,df4)
I use this code for calculations on two consecutive dfs:
It matches the rows of two consecutive dfs by "name" and calculates the ratio of col1 between them. It then adds the result as a new column to each df.
# Ratio of col1 (df1 over df2) for every name; names without a usable
# ratio (NA, e.g. no match in df2) are dropped.
df_combined <- df1 %>%
  left_join(df2, by = "name") %>%
  transmute(name, combined = col1.x / col1.y) %>%
  filter(!is.na(combined))
# Append the `combined` ratio column to `df`, matching rows on `name`.
# Reads `df_combined` from the enclosing environment. Rows of `df` whose
# name has no entry in `df_combined` get NA in `combined`.
add_match_column <- function(df) {
  # Explicit key: silences the "Joining, by = ..." message and prevents an
  # accidental join on any other column the two tables may come to share.
  df %>% left_join(df_combined, by = "name")
}
df_list_matched <- df_list %>%
map(add_match_column)
Now I want to iterate over all other columns with the same function. Thus, col2 of two consecutive dfs in the next step and so on (my dfs have 100+ columns and all dfs have the same structure).
It appears as though you have panel data, where your observations for each period are in separate data frames. Now you want to calculate for individual i in period t the ratio between x_it and x_i(t-1).
library(tidyverse)
# It is better to import all your df's into one list instead of separately assigning them
d <- list(
data.frame(name= c("mark", "peter", "lily"), col1= c(1,2,3),col2= c(4,5,6)),
data.frame(name= c("mark", "liam", "peter"), col1= c(7,8,9),col2= c(1,2,3)),
data.frame(name= c("felix", "liam", "peter"), col1= c(3,5,8),col2= c(1,5,8)),
data.frame(name= c("felix", "lily", "liam"), col1= c(6,2,6),col2= c(4,2,2))
)
# For every name, divide each numeric column by its value in the previous
# period (data frame), then keep the rows where a ratio for col1 exists.
d |>
  bind_rows(.id = "t") |> # Bind to one long df; t = position in the list
  # Make the period numeric so that 10+ periods sort as 1, 2, ..., 10
  # instead of the character order "1", "10", "2", ...
  mutate(t = as.integer(t)) |>
  complete(name, t) |> # add implicit NA's
  arrange(name, t) |> # lag() below relies on period order within name
  group_by(name) |>
  # t is numeric now, so exclude it from the ratio calculation
  mutate(across(where(is.numeric) & !t, ~ .x / lag(.x))) |>
  ungroup() |>
  filter(!is.na(col1))
#> # A tibble: 6 × 4
#> name t col1 col2
#> <chr> <int> <dbl> <dbl>
#> 1 felix 4 2 4
#> 2 liam 3 0.625 2.5
#> 3 liam 4 1.2 0.4
#> 4 mark 2 7 0.25
#> 5 peter 2 4.5 0.6
#> 6 peter 3 0.889 2.67
Created on 2022-01-18 by the reprex package (v2.0.1)
Related
I have a data frame that holds the gender for specific names:
dfgender <- data.frame(name = c("Helen","Erik"), gender = c("F","M"))
How is it possible to use the previous data frame in order to check the names of another column of a dataframe and insert "Neutral" if the name is not in the list of gender dataframe:
Example of the dataframe with the names:
dfnames <- data.frame(names = c("Helen", "Von", "Erik", "Brook"))
Example of expected output
dfnames <- data.frame(name = c("Helen", "Von", "Erik", "Brook"), gender = c("F", "Neutral", "M", "Neutral"))
left_join + replace_na should do:
# Look up each name's gender; names absent from dfgender become "Neutral".
dfnames %>%
  left_join(dfgender, by = c("names" = "name")) %>%
  mutate(gender = replace_na(as.character(gender), "Neutral"))
# names gender
# 1 Helen F
# 2 Von Neutral
# 3 Erik M
# 4 Brook Neutral
The (experimental) rows_update could be an intuitive complement to @Juan C's answer:
library(dplyr)
dfnames |>
mutate(gender = "Neutral") |>
rows_update(rename(dfgender, names = name), "names")
Output:
names gender
1 Helen F
2 Von Neutral
3 Erik M
4 Brook Neutral
Here's a solution similar to Juan C's but with, I think, a simpler replacement of NAs:
library(dplyr)
library(tidyr)
dfgender <- data.frame(name = c("Helen","Erik"), gender = c("F","M"))
dfnames <- data.frame(names = c("Helen", "Von", "Erik", "Brook"))
dfnames %>%
left_join(dfgender, by = c("names" = "name")) %>%
replace_na(list(gender = "Neutral"))
# names gender
# 1 Helen F
# 2 Von Neutral
# 3 Erik M
# 4 Brook Neutral
And here's another solution with no tidyr dependency:
library(dplyr)
dfgender <- data.frame(name = c("Helen","Erik"), gender = c("F","M"))
dfnames <- data.frame(names = c("Helen", "Von", "Erik", "Brook"))
dfnames %>%
left_join(dfgender, by = c("names" = "name")) %>%
mutate(gender = coalesce(gender, "Neutral"))
# names gender
# 1 Helen F
# 2 Von Neutral
# 3 Erik M
# 4 Brook Neutral
I have:
group
items
value
grp1
A
1
grp1
B
2
grp2
B
3
I want:
group
items
value
grp1
A
1
grp1
B
2
grp1
C
NA
grp2
A
NA
grp2
B
3
grp2
C
NA
"group" is taken from the input df. "items" is taken from a codelist vector with all possible entries, all other columns are filled in where known or else NA.
Example:
item_codelist <- c("A", "B", "C")
input <- data.frame("group" = c("grp1", "grp1", "grp2"), "items" = c("A", "B", "B"), "values" = c(1, 2, 3))
I looked into fill(), extend() and complete() but could not get any of these to work for this purpose.
Below is my current workaround but I find it somewhat complicated and I am using a for loop which will take forever for my 200 MB data frame...
If you know an easier way to do this (preferably in dplyr syntax) let me know. Thanks!
# Build every (group, item) combination in one vectorised call instead of
# growing a data frame inside a for loop (which copies it on each pass).
# `items` varies fastest, so rows come out as grp1/A, grp1/B, grp1/C, ...
codelist_df <- expand.grid(
  items = item_codelist,
  group = unique(input$group),
  stringsAsFactors = FALSE
)[, c("group", "items")]

# Join the full grid onto the input data. Combinations that are missing
# from `input` get NA in all remaining columns. The key is spelled out so
# the join cannot silently pick up other shared columns, and no grouping
# is left behind on the result.
output <- input %>%
  full_join(codelist_df, by = c("group", "items")) %>%
  arrange(group, items)
Stefan's comment is by far the best solution, which I was unaware of, but here's one option:
library(dplyr)
library(purrr)
library(tidyr)
input <- data.frame("group" = c("grp1", "grp1", "grp2"), "items" = c("A", "B", "B"), "values" = c(1, 2, 3))
items <- c("A", "B", "C")
# Per group: add the items missing from that group, then fill in the group
# label. The fill must happen inside each group (and in both directions);
# filling downwards over the stacked result would label a group's leading
# missing item with the previous group's name.
input %>%
  split(.$group) %>%
  map_df(~ full_join(.x, as_tibble(items), by = c("items" = "value")) %>%
    arrange(items) %>%
    fill(group, .direction = "downup"))
#> group items values
#> 1 grp1 A 1
#> 2 grp1 B 2
#> 3 grp1 C NA
#> 4 grp2 A NA
#> 5 grp2 B 3
#> 6 grp2 C NA
It seems like you want to cross join the groups and items. To do that, you could use dplyr::full_join() with the argument by = character(), and then left join the values back in:
library(dplyr, warn.conflicts = FALSE)
item_codelist <- tibble(items = c('A', 'B', 'C'))
groups <- tibble(group = c('grp1', 'grp1', 'grp2'))
input <- tibble("group" = c("grp1", "grp1", "grp2"), "items" = c("A", "B", "B"), "values" = c(1, 2, 3))
# Build all item/group combinations, attach the known values, and return one
# row per (group, items) pair.
item_codelist |>
# by = character() requests a cross join: every item paired with every row
# of `groups`
full_join(groups, by = character()) |>
# bring in `values` where the combination exists in `input`, NA otherwise
left_join(input, by = c('items', 'group')) |>
# move `group` to the first column
relocate(group) |>
arrange(group, items) |>
# `groups` lists grp1 twice, so the cross join produced duplicate rows
distinct()
#> # A tibble: 6 × 3
#> group items values
#> <chr> <chr> <dbl>
#> 1 grp1 A 1
#> 2 grp1 B 2
#> 3 grp1 C NA
#> 4 grp2 A NA
#> 5 grp2 B 3
#> 6 grp2 C NA
Created on 2022-07-11 by the reprex package (v2.0.1)
Is there a faster way to do the following, where in the real application, df has many rows (and therefore list_of_colnames has the same number of elements):
list_of_colnames <- list(c("A", "B"), c("A"))
some_vector <- c("fish", "cat")
# For each row, set `indicator` to 1 if any of that row's designated columns
# (the matching element of list_of_colnames) holds a value from some_vector,
# otherwise 0. Returns a list of one-row data frames.
map2(split(df, seq_len(nrow(df))), list_of_colnames, function(row, colnames) {
  # any() already yields a single logical, so convert it directly instead of
  # routing a scalar through the vectorised ifelse().
  row$indicator <- as.numeric(any(row[, colnames] %in% some_vector))
  row
})
While this current implementation works, it takes centuries for the big df. In fact I think split() is a major bottleneck.
Thank you!
One option may be to make use of row/column indexing
# One (row, column) index pair per entry of list_of_colnames, so that each
# row is tested only against its own columns.
rowind <- rep(seq_len(nrow(df)), lengths(list_of_colnames))
colind <- match(unlist(list_of_colnames), names(df))
# Matrix indexing with a two-column index picks exactly those cells.
hits <- as.matrix(df)[cbind(rowind, colind)] %in% some_vector
# Explicit factor levels keep one result per row even when a row lists no
# columns (that row gets NA instead of shifting the others).
# For the sample data this gives 1, 0: row 2 checks only column A ("hello").
df$indicator <- as.vector(
  +(tapply(hits, factor(rowind, levels = seq_len(nrow(df))), FUN = any))
)
-output
> df
A B indicator
1 fish A 1
2 hello cat 1
data
df <- data.frame(A = c('fish', 'hello'), B = c('A', 'cat'))
You can avoid splitting your data frame into a list altogether and instead apply your condition across the rows using rowwise and c_across from dplyr:
library(dplyr)
library(purrr)
list_of_colnames <- list(c("A", "B"), c("A"))
some_vector <- c("fish", "cat")
# For each set of column names, return df with an `indicator` column that is
# 1 where any of those columns holds a value from some_vector, else 0.
map(list_of_colnames, function(cols) {
  flagged <- mutate(
    rowwise(df),
    indicator = as.numeric(any(c_across(all_of(cols)) %in% some_vector))
  )
  ungroup(flagged)
})
Output
Mapping over list_of_colnames still returns a list as output:
[[1]]
# A tibble: 3 x 4
A B C indicator
<chr> <chr> <chr> <dbl>
1 fish dog bird 1
2 dog cat bird 1
3 bird lion cat 0
[[2]]
# A tibble: 3 x 4
A B C indicator
<chr> <chr> <chr> <dbl>
1 fish dog bird 1
2 dog cat bird 0
3 bird lion cat 0
Data
structure(list(A = c("fish", "dog", "bird"), B = c("dog", "cat",
"lion"), C = c("bird", "bird", "cat")), class = "data.frame", row.names = c(NA,
-3L))
library(purrr)
library(tibble)
library(dplyr)
Starting list of dataframes
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
heading = letters[1:2]),
df2 = data.frame(X.32 = as.character(3:4),
another.topic = paste("Line ", 1:2)))
lst
#> $df1
#> X.1 heading
#> 1 1 a
#> 2 2 b
#>
#> $df2
#> X.32 another.topic
#> 1 3 Line 1
#> 2 4 Line 2
Expected "combined" dataframe, with new consistent variable names, and old variable names in the first row of each constituent dataframe.
#> id h1 h2
#> 1 df1 X.1 heading
#> 2 df1 1 a
#> 3 df1 2 b
#> 4 df2 X.32 another.topic
#> 5 df2 3 Line 1
#> 6 df2 4 Line 2
add_row requires "Name-value pairs, passed on to tibble(). Values can be defined only for columns that already exist in .data and unset columns will get an NA value."
Which is what I think I have achieved with this:
df_nms <-
map(lst, names) %>%
map(set_names)
#> $df1
#> X.1 heading
#> "X.1" "heading"
#>
#> $df2
#> X.32 another.topic
#> "X.32" "another.topic"
But I cannot tie up the last bit, using a purrr function to add the names to the head of each dataframe. I've tried numerous variations with map2 and pmap the closest I can get at present (if I treat add_row as a formula , prefixing it with ~ and remove the .y I get a new first row populated with NAs). I think I'm missing how to pass the name-value pairs to the add_row function.
map2(lst, df_nms, add_row(.x, .y, .before = 1)) %>%
map(set_names, c("h1", "h2")) %>%
map_dfr(bind_rows, .id = "id")
#> Error in add_row(.x, .y, .before = 1): object '.x' not found
A pointer to resolve this last step would be most appreciated.
Not quite sure how to do this via purrr map functions, but here is an alternative,
library(dplyr)
# Prepend each data frame's column names as its first row (under generic
# V1, V2, ... headers), then stack all frames with the list name in `id`.
bind_rows(
  lapply(lst, function(tbl) {
    header <- as.data.frame(matrix(names(tbl), ncol = ncol(tbl)))
    rbind(header, setNames(tbl, names(header)))
  }),
  .id = "id"
)
# id V1 V2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Here's an approach using map, rbindlist from data.table and some base R functions:
library(purrr)
library(dplyr)
library(data.table)
map(lst, ~ as.data.frame(unname(rbind(colnames(.x),as.matrix(.x))))) %>%
rbindlist(idcol = "id")
# id V1 V2
#1: df1 X.1 heading
#2: df1 1 a
#3: df1 2 b
#4: df2 X.32 another.topic
#5: df2 3 Line 1
#6: df2 4 Line 2
Alternatively we could use map_df if we use colnames<-:
# Put each frame's column names into its first row, rename the columns to
# h1, h2, ... and stack everything with the list name in `id`.
# seq_along() replaces seq(1, n), which would count backwards (1, 0) for a
# frame without columns.
map_df(lst, ~ as.data.frame(rbind(colnames(.x), as.matrix(.x))) %>%
  setNames(paste0("h", seq_along(.))), .id = "id")
# id h1 h2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Key things here are:
Use as.matrix to get rid of the factor / character incompatibility.
Remove names with unname or set them with colnames<-
Use the idcol = or .id = feature to get the names of the list as a column.
I altered your sample data a bit, setting stringsAsFactors to FALSE when creating the data.frames in lst.
here is a solution using data.table::rbindlist().
#sample data
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
heading = letters[1:2],
stringsAsFactors = FALSE), # !! <--
df2 = data.frame(X.32 = as.character(3:4),
another.topic = paste("Line ", 1:2),
stringsAsFactors = FALSE) # !! <--
)
# Prepend each frame's column names as a first data row, then stack the
# frames by position (use.names = FALSE) with the list names in `id`.
DT <- data.table::rbindlist(
  lapply(lst, function(x) rbind(names(x), x)),
  use.names = FALSE, idcol = "id"
)
# Qualify setnames() as well: data.table is never attached in this snippet,
# so the bare call would fail with "could not find function".
data.table::setnames(DT, names(lst[[1]]), c("h1", "h2"))
# id h1 h2
# 1: df1 X.1 heading
# 2: df1 1 a
# 3: df1 2 b
# 4: df2 X.32 another.topic
# 5: df2 3 Line 1
# 6: df2 4 Line 2
Given the following tibbles:
df1<- tibble(A = c(1:10), B=sample(c(21:30)))
df2<-tibble(A = c(1,2,4,6,7))
I want to create df3 which contains all the rows in which df1$A is found in df2$A. I've tried
df3 <- df1 %>% filter(A == df2$A)
but this returns only 2 rows, because it is matching the rows, not searching for the values. My real data set is several thousand rows.
Thanks in advance!
library(tidyverse)
df1<- tibble(A = c(1:10), B=sample(c(21:30)))
df2<-tibble(A = c(1,2,4,6,7))
# Keep the rows of df1 whose A value occurs anywhere in df2$A.
# Refer to the column as `A` (data-masked) rather than `df1$A`, so the
# condition always applies to the data flowing through the pipe.
df1 %>%
  filter(A %in% df2$A)
The proper way to do this is to use a semi_join()
E.g.,
library(tidyverse)
set.seed(123)
df1 <- tibble(A = c(1:10), B = sample(c(21:30)))
df2 <- tibble(A = c(1, 2, 4, 6, 7))
df3 <- semi_join(df1, df2, by = "A")
df3
#> # A tibble: 5 x 2
#> A B
#> <int> <int>
#> 1 1 23
#> 2 2 30
#> 3 4 28
#> 4 6 29
#> 5 7 21
Created on 2020-05-06 by the reprex package (v0.3.0)