Function that ignores missing columns - r

Say I have the following two data frames:
col1 <- c("a","b","c","d","e")
col2 <- c("A","B","C","D","E")
col1a <- c("a","b","c","d","e")
col2a <- c("A","B","C","D","E")
df1 <- data.frame(col1, col2)
df2 <- data.frame(col1a, col2a)
colnames(df1) <- c("c1","c2")
colnames(df2) <- c("c1","c3")
And I have the following function to rename column headers:
library(dplyr)
col_rename <- function(x) x %>% rename(new_c1 = c1, new_c2 = c2, new_c3 = c3)
When I run this function, I get an error because the columns in the function do not match the columns in the data frame.
df1 <- col_rename(df1)
Error: `c3` contains unknown variables
How can I make the function run only on the present columns, and ignore the ones not present, without removing or changing the column names specified in the function?
EDIT:
I can see how the example was a bit confusing. I have many dataframes with many columns. These columns are shared by some dataframes but not all. However, I want to rename all columns specified by the function, regardless of what is present in the dataframe. It looks something like this:
col1 <- c(1:5)
col2 <- c(1:5)
col3 <- c(1:5)
col4 <- c(1:5)
df1 <- data.frame(col1,col2,col3,col4)
df2 <- data.frame(col1,col2,col3,col4)
colnames(df1) <- c("c1","c2","c6","c8")
colnames(df2) <- c("c1","c3","c2","c8")
AB_rename <- function(x) x %>% rename(aa=col1,bb=col2,
cc=col3,dd=col4,
ee=col5,ff=col6,
gg=col7,hh=col8)
Therefore I cannot follow the example of #Ycw, as they do not all follow the same rename rule. How do I make this ignore columns that are not present?

Here is a workaround to use setNames for the col_rename function.
# Prefix every column name of `x` with "new_" and return the renamed object.
col_rename <- function(x) {
  names(x) <- paste0("new_", names(x))
  x
}
col_rename(df1)
new_c1 new_c2
1 a A
2 b B
3 c C
4 d D
5 e E
col_rename(df2)
new_c1 new_c3
1 a A
2 b B
3 c C
4 d D
5 e E
Or use the select_all function from the dplyr.
library(dplyr)
df1 %>% select_all(function(x) paste0("new_", x))
new_c1 new_c2
1 a A
2 b B
3 c C
4 d D
5 e E
This (~) also works for select_all
df2 %>% select_all(~paste0("new_", .))
new_c1 new_c3
1 a A
2 b B
3 c C
4 d D
5 e E
rename_all also works well
library(dplyr)
df1 %>% rename_all(~paste0("new_", .))
new_c1 new_c2
1 a A
2 b B
3 c C
4 d D
5 e E
Update
This is an update to address OP's updated question.
We can create a named vector showing the relationship between old column names and new column names, and then define a function that changes the names based on the setNames function.
# Create name vector
vec <- paste0("c", 1:8)
names(vec) <- c("aa", "bb", "cc", "dd", "ee", "ff", "gg", "hh")
# Create the function
# Rename the columns of `x` according to a lookup vector, ignoring lookup
# entries whose column is absent from `x`.
#
# x:        a data frame.
# name_vec: named character vector; values are the old column names and
#           names are the corresponding new column names.
#
# Returns `x` with every column listed in `name_vec` renamed. Columns not
# listed keep their current name, and column order is unchanged.
AB_rename <- function(x, name_vec) {
  old_colname <- names(x)
  # Look each column up individually so the new names follow the column
  # order of `x`, not the order of `name_vec`.
  idx <- match(old_colname, name_vec)
  found <- !is.na(idx)
  new_colname <- old_colname
  new_colname[found] <- names(name_vec)[idx[found]]
  setNames(x, new_colname)
}
AB_rename(df1, vec)
aa bb ff hh
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4
5 5 5 5 5

Related

How can I fill NA-values in a data frame column based on the values from an other column? [duplicate]

This question already has an answer here:
Replace NA with mode based on ID attribute
(1 answer)
Closed 2 years ago.
I'd like to fill the NA-values in the F2-column, based on the most common F2-value when grouped by the F1-column.
F1 F2
1 A C
2 B D
3 A NA
4 A C
5 B NA
Desired outcome:
F1 F2
1 A C
2 B D
3 A C
4 A C
5 B D
Thank you for help
Here is a base R solution. First define a function for Mode (Taken from here) and then apply it to your data frame, i.e.
# Return the most frequent value of `x`; ties go to the value seen first.
Mode <- function(x) {
  distinct_vals <- unique(x)
  # Map every element to the position of its value in `distinct_vals`,
  # then count how often each position occurs.
  positions <- match(x, distinct_vals)
  freq <- tabulate(positions)
  distinct_vals[which.max(freq)]
}
# Fill missing F2 values with the most common observed F2 value of the same
# F1 group.
df$F2 <- with(df, ave(F2, F1, FUN = function(i) {
  # Take the mode of the observed values only, so NA cannot be chosen as the
  # "most common" value when a group has more NAs than any real value.
  observed <- i[!is.na(i)]
  # Leave an all-NA group untouched: there is no observed value to fill with.
  if (length(observed) == 0L) {
    return(i)
  }
  replace(i, is.na(i), Mode(observed))
}))
df
# F1 F2
#1 A C
#2 B D
#3 A C
#4 A C
#5 B D
Here is one way using dplyr :
library(dplyr)
df %>%
group_by(F1) %>%
mutate(F2 = replace(F2, is.na(F2),
names(sort(table(F2), decreasing = TRUE)[1])))
# F1 F2
# <chr> <chr>
#1 A C
#2 B D
#3 A C
#4 A C
#5 B D
In case of ties, preference is given to lexicographic order.
Try this:
First, in df2 I get the max count by the variable F1 where F2 is not missing. That gives the most common F2 value when grouped by F1. I join it back onto the original data.frame, use a mutate to fill F2 from the new variable F2_fill, and then remove F2_fill from the data.frame.
library(tidyverse)
df <- tribble(
~F1, ~F2,
'A', 'C',
'B' , 'D',
'A' ,NA,
'A', 'C',
'B', NA)
# Lookup table: one row per F1 group holding its most common non-missing F2.
df2 <- df %>%
  # Drop missing values first so the NA count cannot be the group maximum
  # and leave the group without a fill value.
  filter(!is.na(F2)) %>%
  count(F1, F2) %>%
  group_by(F1) %>%
  filter(n == max(n)) %>%
  # Keep a single row per group so ties cannot duplicate rows in the join.
  slice(1) %>%
  ungroup() %>%
  select(F1, F2_fill = F2)
# Fill missing F2 from the lookup, then drop the helper column.
df3 <- left_join(df, df2, by = "F1") %>%
  mutate(F2 = ifelse(is.na(F2), F2_fill, F2)) %>%
  select(-F2_fill)
You can use ave with table and which.max and subsetting with is.na when it is a character.
i <- is.na(x$F2)
x$F2[i] <- ave(x$F2, x$F1, FUN=function(y) names(which.max(table(y))))[i]
x
# F1 F2
#1 A C
#2 B D
#3 A C
#4 A C
#5 B D
Data:
x <- data.frame(F1 = c("A", "B", "A", "A", "B")
, F2 = c("C", "D", NA, "C", NA))

selecting values of one dataframe based on partial string in another dataframe

I have two dataframes (DF1 and DF2)
DF1 <- as.data.frame(c("A, B","C","A","C, D"))
names(DF1) <- c("parties")
DF1
parties
A, B
C
A
C, D
.
B <- as.data.frame(c(LETTERS[1:10]))
C <- as.data.frame(1:10)
DF2 <- bind_cols(B,C)
names(DF2) <- c("party","party.number")
.
DF2
party party.number
A 1
B 2
C 3
D 4
E 5
F 6
G 7
H 8
I 9
J 10
The desired result should be an additional column in DF1 which contains the party numbers taken from DF2 for each row in DF1.
Desired result (based on DF1):
parties party.numbers
A, B 1, 2
C 3
A 1
C, D 3, 4
I strongly suspect that the answer involves something like str_match(DF1$parties, DF2$party.number) or a similar regular expression, but I can't figure out how to put two (or more) party numbers into the same row (DF2$party.numbers).
One option is gsubfn by matching the pattern as upper-case letter, as replacement use a key/value list
library(gsubfn)
DF1$party.numbers <- gsubfn("[A-Z]", setNames(as.list(DF2$party.number),
DF2$party), as.character(DF1$parties))
DF1
# parties party.numbers
#1 A, B 1, 2
#2 C 3
#3 A 1
#4 C, D 3, 4
An alternative solution using tidyverse. You can reshape DF1 to have one string per row, then join DF2 and then reshape back to your initial form:
library(tidyverse)
DF1 <- as.data.frame(c("A, B","C","A","C, D"))
names(DF1) <- c("parties")
B <- as.data.frame(c(LETTERS[1:10]))
C <- as.data.frame(1:10)
DF2 <- bind_cols(B,C)
names(DF2) <- c("party","party.number")
DF1 %>%
group_by(id = row_number()) %>%
separate_rows(parties) %>%
left_join(DF2, by=c("parties"="party")) %>%
summarise(parties = paste(parties, collapse = ", "),
party.numbers = paste(party.number, collapse = ", ")) %>%
select(-id)
# # A tibble: 4 x 2
# parties party.numbers
# <chr> <chr>
# 1 A, B 1, 2
# 2 C 3
# 3 A 1
# 4 C, D 3, 4

parent child structure in R dataframe

I have a csv that contains an org structure as follows plus some additional columns. I use R to create charts and it works great!
The challenge is when trying to create the charts for a subset manager and its children/grandchildren.
Is there any filtering that is possible in dplyr or any alternative package?
Sample format:
emp_id mgr_id nest_id
A A 0
B A 1
C B 2
D C 3
D1 D 4
D2 D 4
E C 3
E1 E 4
F C 3
G B 2
H G 3
The subset I need is for manager "C"
Scenario 1:emp_id==C should contain all nodes of 'D','D1','D2','E','E1','F'
expected structure:
manager,all_children
C D
C D1
C D2
C E
C E1
C F
Scenario 2:emp_id==C should contain all above nodes but retain mgr_id structure for 'D','E'
expected structure:
manager,all_children
C D
C E
C F
D D1
D D2
E E1
Consider the base package with by which creates a df list for every level of mgr_id (not just C):
SCENARIO 1
# For every mgr_id level, stack that manager's children and grandchildren,
# both labelled with the manager's id.
dfList <- by(df, df$mgr_id, function(i) {
  names(i) <- paste0(names(i), "_") # SUFFIX UNDERSCORE (TO AVOID DUP COLUMNS)
  child <- merge(i, df, by.x = "mgr_id_", by.y = "emp_id")[, 1:2]
  grandchild <- merge(child, df, by.x = "emp_id_", by.y = "mgr_id")[c("mgr_id_", "emp_id")]
  # Anchor on the trailing underscore only; the pattern has no leading
  # quantifier, so it does not depend on how the regex engine treats one.
  names(child) <- sub("_$", "", names(child)) # REMOVE LAST UNDERSCORE
  names(grandchild) <- sub("_$", "", names(grandchild)) # REMOVE LAST UNDERSCORE
  rbind(child, grandchild)
})
dfList$C
# mgr_id emp_id
# 1 C D
# 2 C E
# 3 C F
# 4 C D1
# 5 C D2
# 6 C E1
SCENARIO 2 (where the selected columns change in grandchild and then first column rename)
# For every mgr_id level, stack that manager's children with the
# grandchildren, where each grandchild keeps its direct manager as mgr_id.
dfList <- by(df, df$mgr_id, function(i) {
  names(i) <- paste0(names(i), "_") # SUFFIX UNDERSCORE (TO AVOID DUP COLUMNS)
  child <- merge(i, df, by.x = "mgr_id_", by.y = "emp_id")[, 1:2]
  grandchild <- merge(child, df, by.x = "emp_id_", by.y = "mgr_id")[c("emp_id_", "emp_id")]
  # Strip only the trailing underscore. A ".*_$" pattern would blank the
  # whole column name instead of removing the suffix.
  names(child) <- sub("_$", "", names(child)) # REMOVE LAST UNDERSCORE
  names(grandchild) <- sub("_$", "", names(grandchild)) # REMOVE LAST UNDERSCORE
  # The first column holds the direct manager, so label it mgr_id to line up
  # with `child` in rbind().
  names(grandchild)[1] <- "mgr_id"
  rbind(child, grandchild)
})
dfList$C
# mgr_id emp_id
# 1 C D
# 2 C E
# 3 C F
# 4 D D1
# 5 D D2
# 6 E E1
Here is one solution using functions from dplyr and data.table. dt3 is the output for scenario 1, while dt4 is the output for scenario 2.
# Load packages
library(dplyr)
library(data.table)
# Create example data frame
dt <- read.table(text = "emp_id mgr_id nest_id
A A 0
B A 1
C B 2
D C 3
D1 D 4
D2 D 4
E C 3
E1 E 4
F C 3
G B 2
H G 3",
header = TRUE, stringsAsFactors = FALSE)
# Process the data
dt2 <- dt %>%
# Filter levels lower than 1
filter(nest_id > 1) %>%
mutate(group_id = ifelse(nest_id > 2, 0, 1)) %>%
# Create "run_id", which will be used to fill manager label
mutate(run_id = rleid(group_id)) %>%
mutate(run_id = ifelse(run_id %% 2 == 0, run_id - 1, run_id)) %>%
group_by(run_id) %>%
mutate(manager = first(emp_id)) %>%
# Select for manager C
filter(manager %in% "C") %>%
ungroup() %>%
# Remove rows if manager == emp_id
filter(manager != emp_id) %>%
rename(all_children = emp_id)
# Scenario 1
dt3 <- dt2 %>% select(manager, all_children)
# Scenario 2
dt4 <- dt2 %>%
select(manager = mgr_id, all_children) %>%
arrange(manager, all_children)

Grouping low occurring levels in a dataframe in R

Suppose that I have a data frame that has a column called C. C has many levels that only occur once. How would I rename all of the levels that occur only once with a new level (called z)?
A B C
a a a
a b b
a a c
a b d
a b a
The above would turn into:
A B C
a a a
a b z
a a z
a b z
a b a
What about this (assuming your data is df)?
levels(df[,3])[table(df[,3])==1] <- "z"
df
A B C
1 a a a
2 a b z
3 a a z
4 a b z
5 a b a
I'm sure there is a more elegant way to do this but here is one solution:
df <- read.table(text = "A B C
a a a
a b b
a a c
a b d
a b a", header = TRUE)
# Get the number of times each factor occurs:
counts <- table(df$C)
# Replace each one that only occurs once with "z"
df$C <- ifelse(df$C %in% names(counts[counts == 1]), "z", as.character(df$C))
# Since the levels changed, encode as a factor again:
df$C <- factor(df$C)
This gives:
R> df$C
[1] a z z z a
Levels: a z
using dplyr:
library(dplyr)
df %>% group_by(C) %>%
mutate(D = as.character(ifelse(n() == 1, "z", as.character(C))))
There is some ugly stuff to deal with the ifelse in there.

Aggregated sum of another column over a vector of names in a data.frame

I have the following data.frame:
> DF <- data.frame(names = I(list(c("a", "b", "c"), c("a"), c("c", "d"))),
counts = c(1, 2, 3))
> DF
names counts
1 a, b, c 1
2 a 2
3 c, d 3
How do I get a result that sums up the total counts of each name?
Something like:
name sum
a 3
b 1
c 4
d 3
Try
DF1 <- data.frame(name=unlist(DF$names),
val=rep(DF$counts,sapply(DF$names, length)))
Or
DF1 <- do.call(rbind,Map(data.frame, name=DF$names, val=DF$counts))
aggregate(val~name, DF1, sum)
# name val
#1 a 3
#2 b 1
#3 c 4
#4 d 3
Or
DF2 <- transform(stack(setNames(DF$names, DF$counts)),
ind=as.numeric(as.character(ind)))
aggregate(ind~values, DF2, sum)

Resources