Find gender based on a specific dataframe - r

I have a data frame that holds the gender for specific names:
dfgender <- data.frame(name = c("Helen","Erik"), gender = c("F","M"))
How can I use this data frame to look up the names in a column of another data frame, and insert "Neutral" when a name is not in the gender data frame?
Example of the dataframe with the names:
dfnames <- data.frame(names = c("Helen", "Von", "Erik", "Brook"))
Example of expected output
dfnames <- data.frame(name = c("Helen", "Von", "Erik", "Brook"), gender = c("F", "Neutral", "M", "Neutral"))

left_join + replace_na should do:
library(dplyr)
library(tidyr)

# Look each name up in dfgender; names without a match come back as NA and
# are then labelled "Neutral". as.character() guards against gender being a
# factor, where "Neutral" would not be an existing level.
dfnames %>%
  left_join(dfgender, by = c("names" = "name")) %>%
  mutate(gender = replace_na(as.character(gender), "Neutral"))
# names gender
# 1 Helen F
# 2 Von Neutral
# 3 Erik M
# 4 Brook Neutral

The (experimental) rows_update could be an intuitive complement to @Juan C's answer:
library(dplyr)
dfnames |>
mutate(gender = "Neutral") |>
rows_update(rename(dfgender, names = name), "names")
Output:
names gender
1 Helen F
2 Von Neutral
3 Erik M
4 Brook Neutral

Here's a solution similar to Juan C's but with, I think, a simpler replacement of NAs:
library(dplyr)
library(tidyr)
dfgender <- data.frame(name = c("Helen","Erik"), gender = c("F","M"))
dfnames <- data.frame(names = c("Helen", "Von", "Erik", "Brook"))
dfnames %>%
left_join(dfgender, by = c("names" = "name")) %>%
replace_na(list(gender = "Neutral"))
# names gender
# 1 Helen F
# 2 Von Neutral
# 3 Erik M
# 4 Brook Neutral
And here's another solution with no tidyr dependency:
library(dplyr)
dfgender <- data.frame(name = c("Helen","Erik"), gender = c("F","M"))
dfnames <- data.frame(names = c("Helen", "Von", "Erik", "Brook"))
dfnames %>%
left_join(dfgender, by = c("names" = "name")) %>%
mutate(gender = coalesce(gender, "Neutral"))
# names gender
# 1 Helen F
# 2 Von Neutral
# 3 Erik M
# 4 Brook Neutral

Related

Function over every column of list of dfs

I have the following dfs
df1 <- data.frame(name= c("mark", "peter", "lily"), col1= c(1,2,3),col2= c(4,5,6))
df2 <- data.frame(name= c("mark", "liam", "peter"), col1= c(7,8,9),col2= c(1,2,3))
df3 <- data.frame(name= c("felix", "liam", "peter"), col1= c(3,5,8),col2= c(1,5,8))
df4 <- data.frame(name= c("felix", "lily", "liam"), col1= c(6,2,6),col2= c(4,2,2))
df_list <- list(df1,df2,df3,df4)
I use this code for calculations on two consecutive dfs:
It matches the "rows" of two consecutive dfs by "name" and calcs a ratio of col1 between those. It then returns the results as a new column to the dfs.
df_combined <- df1 %>%
left_join(df2, by="name") %>%
mutate(combined=(col1.x/col1.y)) %>%
filter(!is.na(combined)) %>%
select(name,combined)
# Append the precomputed `combined` ratio column to one data frame.
#
# df: a data frame with a `name` column.
# Returns `df` with the columns of the global `df_combined` (name, combined)
# joined on; rows whose name has no ratio get NA.
add_match_column <- function(df) {
  # Name the key explicitly: this avoids the "Joining with `by = ...`"
  # message and stops any other shared column from silently becoming a key.
  left_join(df, df_combined, by = "name")
}
df_list_matched <- df_list %>%
map(add_match_column)
Now I want to iterate over all other columns with the same function. Thus, col2 of two consecutive dfs in the next step and so on (my dfs have 100+ columns and all dfs have the same structure).
It appears as though you have panel data, where your observations for each period are in separate data frames. Now you want to calculate for individual i in period t the ratio between x_it and x_i(t-1).
library(tidyverse)
# It is better to import all your df's into one list instead of separately assigning them
d <- list(
data.frame(name= c("mark", "peter", "lily"), col1= c(1,2,3),col2= c(4,5,6)),
data.frame(name= c("mark", "liam", "peter"), col1= c(7,8,9),col2= c(1,2,3)),
data.frame(name= c("felix", "liam", "peter"), col1= c(3,5,8),col2= c(1,5,8)),
data.frame(name= c("felix", "lily", "liam"), col1= c(6,2,6),col2= c(4,2,2))
)
# Stack the per-period data frames and, within each name, divide every numeric
# column by its value in the preceding row of that name.
d |>
bind_rows(.id = "t") |> # Bind to one long df; t records which list element a row came from
arrange(name, t) |>
complete(name, t) |> # Add the missing name/t combinations as explicit NA rows
group_by(name) |>
# Ratio of each value to the previous row within the same name (NA for the first row)
mutate(across(where(is.numeric), ~ .x/lag(.x))) |>
ungroup() |>
# Keep only rows where the col1 ratio could be computed
filter(!is.na(col1))
# NOTE(review): t is a character column (printed as <chr>), so it sorts as text
# ("10" before "2"); with ten or more data frames it presumably needs converting
# to integer before ordering - confirm.
#> # A tibble: 6 × 4
#> name t col1 col2
#> <chr> <chr> <dbl> <dbl>
#> 1 felix 4 2 4
#> 2 liam 3 0.625 2.5
#> 3 liam 4 1.2 0.4
#> 4 mark 2 7 0.25
#> 5 peter 2 4.5 0.6
#> 6 peter 3 0.889 2.67
Created on 2022-01-18 by the reprex package (v2.0.1)

More efficient way to purrr::map2 for a large dataframe

Is there a faster way to do the following, where in the real application, df has many rows (and therefore list_of_colnames has the same number of elements):
list_of_colnames <- list(c("A", "B"), c("A"))
some_vector <- c("fish", "cat")
map2(split(df, seq(nrow(df))), list_of_colnames, function(row, colnames) {
row$indicator <- ifelse(any(row[, colnames] %in% some_vector), 1, 0)
return(row)
})
While this current implementation works, it takes centuries for the big df. In fact I think split() is a major bottleneck.
Thank you!
One option may be to make use of row/column indexing
# Element i of list_of_colnames names the columns to inspect in row i of df.
wanted <- unlist(list_of_colnames)
stopifnot(length(list_of_colnames) == nrow(df), all(wanted %in% names(df)))
cols <- unique(wanted)
# One entry per (row, column) pair that has to be checked.
rowind <- rep(seq_len(nrow(df)), lengths(list_of_colnames))
# A two-column index matrix extracts exactly those cells, so a row is never
# tested against columns that were listed for a different row.
cells <- as.matrix(df[cols])[cbind(rowind, match(wanted, cols))]
# Flag a row when any of its own cells is in some_vector; rows with no
# columns listed get 0.
df$indicator <- as.integer(seq_len(nrow(df)) %in% rowind[cells %in% some_vector])
-output
> df
A B indicator
1 fish A 1
2 hello cat 0
data
df <- data.frame(A = c('fish', 'hello'), B = c('A', 'cat'))
You can avoid splitting your data frame into a list altogether and instead apply your condition across the rows using rowwise and c_across from dplyr:
library(dplyr)
library(purrr)
list_of_colnames <- list(c("A", "B"), c("A"))
some_vector <- c("fish", "cat")
# For each set of column names, return a copy of df with an `indicator`
# column: 1 when any of those columns in the row holds a value from
# some_vector, otherwise 0. The result is a list with one tibble per set.
map(list_of_colnames, function(cols) {
  flagged <- mutate(
    rowwise(df),
    indicator = as.numeric(any(c_across(all_of(cols)) %in% some_vector))
  )
  ungroup(flagged)
})
Output
Note that mapping over list_of_colnames still returns a list as output:
[[1]]
# A tibble: 3 x 4
A B C indicator
<chr> <chr> <chr> <dbl>
1 fish dog bird 1
2 dog cat bird 1
3 bird lion cat 0
[[2]]
# A tibble: 3 x 4
A B C indicator
<chr> <chr> <chr> <dbl>
1 fish dog bird 1
2 dog cat bird 0
3 bird lion cat 0
Data
structure(list(A = c("fish", "dog", "bird"), B = c("dog", "cat",
"lion"), C = c("bird", "bird", "cat")), class = "data.frame", row.names = c(NA,
-3L))

How to spread the count data

I have an example data like below.
site <- c("a", "b")
RankA <- c("3","1")
RankB <- c("1","3")
RankC <- c("0","1")
rawdata <- cbind(site, RankA, RankB, RankC)
I would like to transform like "newdata".
site <- c("a","a","a","a", "b","b","b","b","b")
Rank <- c("A","A","A","B","A","B","B","B","C")
newdata<- cbind(site,Rank)
Thanks,
#edit
rawdata is the result of an evaluation survey about the site. For each site, the number of evaluations at each rank (A to C) is recorded. For example,"site a" has 3 votes for RankA, 1 vote for RankB, and 0 votes for RankC. I want to convert this data into "newdata" where each evaluation is one row.
Try this, using dplyr and tidyr:
library(dplyr)
library(tidyr)

site <- c("a", "b")
RankA <- c("3","1")
RankB <- c("1","3")
RankC <- c("0","1")
# The counts arrive as strings; convert them once so they can be used as
# repetition counts below.
df <- data.frame(
  site,
  A = as.integer(RankA),
  B = as.integer(RankB),
  C = as.integer(RankC)
)
# One row per site/rank pair, with the vote count in `rep`.
df <- pivot_longer(df, cols = 2:4, values_to = "rep", names_to = "rank")
# Repeat each row `rep` times (zero-count rows disappear), then drop the count.
df <- df[rep(seq_len(nrow(df)), df$rep), ] %>%
  select(-rep)
df
You can use uncount from tidyr -
library(dplyr)
library(tidyr)
rawdata <- data.frame(site = c("a", "b"), RankA = c(3,1),
                      RankB = c(1,3), RankC = c(0,1))
# Reshape to one row per site/rank with the vote count in `n`, stripping the
# "Rank" prefix so the column holds A/B/C as in the requested `newdata`;
# uncount() then repeats each row `n` times and drops the count column.
newdata <- rawdata %>%
  pivot_longer(cols = -site, names_to = "Rank", names_prefix = "Rank",
               values_to = "n") %>%
  uncount(n)
newdata
# site Rank
# <chr> <chr>
#1 a A
#2 a A
#3 a A
#4 a B
#5 b A
#6 b B
#7 b B
#8 b B
#9 b C

How to update rownames in R?

I have 3 data frames with three variables each and the name of the player
a <- rnorm(16, 3, 2)
b <- rnorm(16, 1, 3)
c <- rpois(16, 3)
year <- c(rep(2015, 5), rep(2016, 5), rep(2017, 6))
player <- c("Alex", "CT", "Bill", "Brian", "Collin", "Chez", "Adam", "Danny III", "Lee", "Chris",
"Erik", "Axel", "Louis", "Justin", "Dustin", "Johnson")
df_1 <- data.frame(player, year, a, b, c)
d <- rnorm(16, 3, 2)
e <- rnorm(16, 1, 3)
f <- rpois(16, 3)
year <- c(rep(2015, 5), rep(2016, 5), rep(2017, 6))
player <- c("Alexander", "C.T.", "Bill", "Brian", "Collin", "Chez", "Adam", "Danny IV", "Lee", "Chris",
"Erik", "Axel", "Louis", "Justin", "Dustin", "Johnson")
df_2 <- data.frame(player, year, d, e, f)
g <- rnorm(16, 3, 2)
h <- rnorm(16, 1, 3)
i <- rpois(16, 3)
year <- c(rep(2015, 5), rep(2016, 5), rep(2017, 6))
player <- c("Alex", "CT", "Bill", "Brian", "Collin", "Chez", "Adam", "Danny III", "Lee", "Chris",
"Erik", "Axel", "Louis", "Justin", "Dustin", "Johnson")
df_3 <- data.frame(player, year, g, h, i)
This data frame contains the name of the player corresponding to each data set of variables.
For example, Alex is the same as Alexander in variables from d to f, and it is the same as Alex in observations from g to i. Danny III is named Danny IV in variables from d to f and it is named Danny III in variables from g to i.
a_to_c <- c("Alex", "CT", "Danny III")
d_to_f <- c("Alexander", "C.T.", "Danny IV")
g_to_i <- c("Alex", "CT", "Danny III")
names_palyer <- data.frame(a_to_c, d_to_f, g_to_i)
I want to merge the three data frames by year and player into a single data frame. I need to use the information from "names_player" to correctly match the player with the data
I made this example small for simplicity. In reality I have thousands of observations, so I need a way to match the players' names automatically and end up with a single data frame containing the information from all three data frames.
Initialize the output ('out') as the first data ('df_1'). Loop over the index of columns of 'names_palyer' (excluding the last column), get the value of 'df_' object corresponding (incrementing 1 - i + 1 - assuming objects are named as df_1, df_2 etc.), then select a subset of columns of 'names_palyer' (keydat), use match to get the index of matching values with 'player' column of 'tmp' data. Replace the 'player' to the first column values of 'keydat' based on index, then do the merge (left join - all.x = TRUE), and at end, change the output 'player' that match to keydat' column to second column values of 'keydat' (so that it would be useful for the next iteration)
# Merge df_1, df_2, ... into one table keyed on player and year, translating
# player spellings between consecutive data frames with names_palyer.
# Column i of names_palyer holds the spellings used in df_i.
out <- df_1
for (i in seq_len(ncol(names_palyer) - 1)) {
  tmp <- get(paste0("df_", i + 1))
  keydat <- names_palyer[c(i, i + 1)]

  # Rewrite tmp's players from the df_(i + 1) spelling to the df_i spelling
  # that `out` currently carries. Looking tmp$player up in keydat (rather than
  # the reverse) renames every row of a player, not just the first one.
  hit <- match(tmp$player, keydat[[2]])
  tmp$player[!is.na(hit)] <- keydat[[1]][hit[!is.na(hit)]]

  out <- merge(out, tmp, by = c("player", "year"), all.x = TRUE)

  # Switch `out` to the df_(i + 1) spelling so the next pass can match on it.
  # Only rows that actually have a mapping are touched, so the replacement
  # always has the same length as the selection.
  hit <- match(out$player, keydat[[1]])
  out$player[!is.na(hit)] <- keydat[[2]][hit[!is.na(hit)]]
}
-output
out
player year a b c d e f g h i
1 Adam 2016 0.03587367 -0.57907496 3 5.1149009 2.47064240 2 2.3325348 0.62526907 6
2 Alex 2015 1.27778013 0.05809471 0 4.1932959 4.37934704 0 4.3226737 -0.33523019 5
3 Axel 2017 2.56466723 0.43108713 2 5.9970138 -2.19947169 4 0.9717511 2.05843957 3
4 Bill 2015 2.05594607 3.96167974 3 2.5232810 3.87191286 3 3.1726895 3.43683108 0
5 Brian 2015 3.44690732 0.35032810 4 4.7287671 0.08108714 2 2.8519495 -0.08249603 2
6 CT 2015 5.85679299 -1.57623304 2 3.9653678 1.68389034 3 3.0328709 1.04315644 2
7 Chez 2016 0.73604605 -2.58101736 1 4.0642894 0.04941299 3 5.4688474 -1.82831432 3
8 Chris 2016 0.95621081 2.05206411 4 2.7249987 2.42911270 8 -0.4515070 -2.12097504 0
9 Collin 2015 7.14194691 0.74030236 5 4.7879545 5.41397214 4 1.4835656 0.92897125 2
10 Danny III 2016 4.59832890 0.60355092 5 4.4822495 4.15865653 0 2.4950848 3.31059942 3
11 Dustin 2017 0.26640646 -0.23381080 4 5.3164916 3.67001803 1 0.7011976 2.59135173 4
12 Erik 2017 0.27363760 -4.50110125 3 4.9495033 3.31417537 3 4.1907692 5.57914934 6
13 Johnson 2017 7.12013083 2.52775367 3 1.9192381 4.33916287 2 3.3836699 -2.37444447 3
14 Justin 2017 3.41710305 -3.82843506 4 5.5590782 0.56030426 1 0.1670448 5.99934712 6
15 Lee 2016 -1.02002976 -3.24576311 4 0.9538381 -0.91783716 5 2.5668076 -0.67247680 2
16 Louis 2017 1.94420093 0.47369179 3 2.8249960 -1.28630731 7 3.0070664 1.25132019 5
With the OP's new data
# Same merge as before, written so that it also copes with names that are
# missing from one of the data frames. Column i of names_palyer holds the
# spellings used in df_i.
# Plain assignment is enough for a data.frame; the unqualified copy() only
# exists when data.table is attached.
out <- df_1
for (i in seq_len(ncol(names_palyer) - 1)) {
  tmp <- get(paste0("df_", i + 1))
  keydat <- names_palyer[c(i, i + 1)]

  # Translate tmp's players to the spelling `out` currently uses. Every row
  # of a player is renamed, and players without a mapping are left alone.
  hit <- match(tmp$player, keydat[[2]])
  tmp$player[!is.na(hit)] <- keydat[[1]][hit[!is.na(hit)]]

  out <- merge(out, tmp, by = c("player", "year"), all.x = TRUE)

  # Move `out` on to the df_(i + 1) spelling for the next pass, whether or
  # not the player appeared in tmp, so later data frames can still match.
  hit <- match(out$player, keydat[[1]])
  out$player[!is.na(hit)] <- keydat[[2]][hit[!is.na(hit)]]
}
library(dplyr)
library(purrr)
# Collapse value columns that share a base name into a single column each.
# Columns 1:2 of `out` are set aside and re-attached at the end; the remaining
# columns are grouped by their name with everything from the first "." onward
# removed (presumably the suffixes merge() adds to clashing names - confirm).
split.default(out[-(1:2)], sub("\\..*", "", names(out)[-(1:2)])) %>%
# Within each group, keep the first non-NA value across the columns, row by row
map_dfc(reduce, coalesce) %>%
# Put the first two columns of `out` back in front of the collapsed columns
bind_cols( out[1:2], .)

Assign list name to dataframe list element

I have a list of lists of dataframes:
library(dplyr)
library(magrittr)
a <- list(first = data.frame(x=runif(1), y=runif(1)),
second = data.frame(x=runif(5), y=runif(5)))
b <- list(first = data.frame(x=runif(1), y=runif(1)),
second = data.frame(x=runif(5), y=runif(5)))
a <- a %>% set_names(1:length(a))
b <- b %>% set_names(1:length(b))
c <- list(a, b)
c <- c %>% set_names(1:length(c))
I want to assign the two levels of list names as new columns to the dataframe, and then bind them into one dataframe. The desired output is something like:
x y name1 name2
.23 .43 1 1
.23 .43 1 2
.23 .43 2 1
.23 .43 2 2
Where the values of x and y are not the point. I am struggling with this as lapply does not access the name of the element of the list.
Thanks.
Maybe this helps:
library(reshape2)
library(tidyr)
library(dplyr)
# Flatten the two-level list of data frames into one data frame, storing the
# outer list name in `name1` and the inner list name in `name2`.
res <- c %>%
  # Inner level: stack the data frames of each sub-list, keeping their names.
  lapply(bind_rows, .id = "name2") %>%
  # Outer level: stack the results, keeping the names of the top-level list.
  bind_rows(.id = "name1") %>%
  # Order the columns as in the requested output.
  select(x, y, name1, name2)

Resources