Add names of dataframes as columns - r

I would like to combine multiple dataframes but before that I'd like to add the name of the dataframe as character string in each entry of a new column. I'm almost there but don't see the problem. Code:
df1 <- data.frame("X1"=c(1,1),"X2"=c(1,1))
df2 <- data.frame("X1"=c(2,2),"X2"=c(2,2))
df3 <- data.frame("X1"=c(3,3),"X2"=c(3,3))
# Tag a data frame with its own name: appends a "newCol" column whose every
# entry is the deparsed expression the caller passed in (the parameter must
# stay named `df`, and substitute() must be evaluated directly on it, for
# the name capture to work).
addCol <- function(df) {
  df$newCol <- deparse(substitute(df))
  df
}
# Extracts name of dataframe and writes it into entries of newCol
alldfsList <- lapply(list(df1,df2,df3), function(df) x <- addCol(df))
# Should apply addCol function to all dataframes, generates a list of lists
alldfs <- do.call(rbind, alldfsList) # Converts list of lists into dataframe
The problem is that the second command doesn't write the name of the dataframe into the column entries, but the placeholder, "df". But when I apply the addCol function manually to a single dataframe, it works. Can you help? Thanks!
Output:
> alldfs
X1 X2 newCol
1 1 1 df
2 1 1 df
3 2 2 df
4 2 2 df
5 3 3 df
6 3 3 df
>
Function applied to a single df works:
> addCol(df1)
X1 X2 newCol
1 1 1 df1
2 1 1 df1
>

The easiest would be to use dplyr::bind_rows
library(dplyr)
bind_rows(lst(df1,df2,df3),.id="newCol")
# newCol X1 X2
# 1 df1 1 1
# 2 df1 1 1
# 3 df2 2 2
# 4 df2 2 2
# 5 df3 3 3
# 6 df3 3 3

Moody_Mudskipper's answer is a better solution; this is just so you understand what's happening with your code.
From the substitute help page:
substitute returns the parse tree for the (unevaluated) expression expr, substituting any variables bound in env
When you run addCol inside a function in lapply, substitute gets the name from that environment. Look what happens when you change the syntax in lapply:
> lapply(list(df1,df2,df3), function(x) x <- addCol(x))
[[1]]
X1 X2 newCol
1 1 1 x
2 1 1 x
[[2]]
X1 X2 newCol
1 2 2 x
2 2 2 x
[[3]]
X1 X2 newCol
1 3 3 x
2 3 3 x
What you need is to use a different method to get the object name, or change the code so the function takes the name as input. Here's an example:
# Look up a data frame by its name (a character string) and return a copy
# with a "newCol" column holding that name in every row.  Using the name as
# the input sidesteps the substitute()-inside-lapply problem entirely.
addCol <- function(df.name) {
  tagged <- get(df.name)      # resolve the object from the calling scope
  tagged$newCol <- df.name
  tagged
}
> do.call(rbind, lapply(ls(pattern='df'), addCol))
X1 X2 newCol
1 1 1 df1
2 1 1 df1
3 2 2 df2
4 2 2 df2
5 3 3 df3
6 3 3 df3

Related

R Compare duplicate values for each row in two data sets + in any order

I want to compare whether the set of values in each row is the same.
In this case, duplicated and all_equal function are not suitable.
Reproducible Sample Data
a=c(1,1)
b=c(2,2)
c=c(3,3)
d=c(4,5)
df1<-rbind(a,b,c)
df1<-df1 %>% as.data.frame()
df2<-rbind(a,d,b)
df2<-df2 %>% as.data.frame()
> df1
V1 V2
a 1 1
b 2 2
c 3 3
> df2
V1 V2
a 1 1
d 4 5
b 2 2
Expected output
df1$idx1 <- 1:nrow(df1)
df2$idx2 <- 1:nrow(df2)
df1
df2
df3<-full_join(df1,df2,by=c('V1','V2'))
df3
df3$need <- ifelse(is.na(df3$idx2), 'only_df1',
ifelse(is.na(df3$idx1), 'only_df2',
'duplicated'))
> df3
V1 V2 idx1 idx2 need
1 1 1 1 1 duplicated
2 2 2 2 3 duplicated
3 3 3 3 NA only_df1
4 4 5 NA 2 only_df2
I try... but This is complicated.
I think there must be a better way. help!
Since you are already using dplyr, you may use case_when, which is easier to understand and write, especially when you have a lot of conditions.
library(dplyr)
full_join(df1,df2,by=c('V1','V2')) %>%
mutate(need = case_when(is.na(idx2) ~ 'only_df1',
is.na(idx1) ~ 'only_df2',
TRUE ~ 'duplicated'))
# V1 V2 idx1 idx2 need
#1 1 1 1 1 duplicated
#2 2 2 2 3 duplicated
#3 3 3 3 NA only_df1
#4 4 5 NA 2 only_df2
As already mentioned in the comments, your way looks ok. In case you want to see how it could be done in base:
a <- c(1,1)
b <- c(2,2)
c <- c(3,3) #Better don't use existing function names
d <- c(4,5)
df1 <- as.data.frame(rbind(a,b,c))
df2 <- as.data.frame(rbind(a,d,b))
df1$idx1 <- seq_len(nrow(df1)) #seq_len will also work in case nrow=0
df2$idx2 <- seq_len(nrow(df2))
df3 <- merge(df1, df2, all=TRUE)
df3$need <- ifelse(is.na(df3$idx2), "only_df1",
ifelse(is.na(df3$idx1), "only_df2",
"duplicated"))
df3
# V1 V2 idx1 idx2 need
#1 1 1 1 1 duplicated
#2 2 2 2 3 duplicated
#3 3 3 3 NA only_df1
#4 4 5 NA 2 only_df2
We can use
library(arsenal)
summary(comparedf(df1, df2))

Row wise count of Zeros' and NA in R for Columns

I want to find count of rows with respect to number of Zero's and NA's in the data frame , for example
number of rows having zeros in only 1 column etc..
code for the df is below and need to find for columns from M1 to M5
O/P needed for Zeros and NA , link provided below for desired O/P
https://imgur.com/y9qeyhV
id <- 1:9
M1 <- c(0,NA,1,0,0,NA,NA,1,7)
M2 <- c(NA,NA,0,NA,0,NA,NA,1,7)
M3 <- c(1,NA,0,0,0,1,NA,1,7)
M4 <- c(0,NA,0,3,0,NA,NA,1,7)
M5 <- c(5,0,0,NA,0,0,NA,0,NA)
data <- cbind(id,M1,M2,M3,M4,M5)
data <- as.data.frame(data)
Desired Output:
Try this
table(rowSums(is.na(data)))
# 0 1 2 3 4 5
# 3 2 1 1 1 1
table(factor(rowSums(data == 0, na.rm = T), levels = 0:5))
# 0 1 2 3 4 5
# 2 3 2 0 1 1
You can also pass the codes above to data.frame() or as.data.frame() to get an data.frame object like your expected output shows.
For NA:
data.frame(table(rowSums(is.na(data[startsWith(names(data),"M")]))))
Var1 Freq
1 0 3
2 1 2
3 2 1
4 3 1
5 4 1
6 5 1
For zeros
data.frame(table(factor(rowSums(0==data[startsWith(names(data),"M")],TRUE),0:5)))
Var1 Freq
1 0 2
2 1 3
3 2 2
4 3 0
5 4 1
6 5 1
apply(data, 1, function(x) length(x[is.na(x)]))
This will give you a vector. Each element corresponds to a row and its value is the number of NA elements in that row.
My solution is kind of complicated, but it gives the desired output using apply functions:
# For each k in `count`, tabulate how many rows of `data` contain exactly k
# elements satisfying the row-wise predicate `fun` (e.g. is.na, or x == 0).
# NA results of the predicate are not counted as matches, mirroring the
# length(which(...)) idiom.
myFun <- function(data, count, fun) {
  # Number of predicate hits per row; sum() over a logical vector with
  # na.rm = TRUE equals length(which(...)).
  hits_per_row <- apply(data, 1, function(row) sum(fun(row), na.rm = TRUE))
  # How many rows have exactly k hits, for each requested k.
  vapply(count, function(k) sum(hits_per_row == k), integer(1))
}
myFun(data, 0:5, is.na)
myFun(data, 0:5, function(x) x == 0)
(You made a mistake in your example: two rows have no zeroes in any column: rows 7 and 9.)
Here is a for loop option to count NAs and Zeros in each row and then use dplyr::count to summarize the frequency of each value.
data$CountNA<-NA
for (i in 1:nrow(data)){
data[i,"CountNA"]<-length(which(is.na(data[i,1:(ncol(data)-1)])))}
count(data, CountNA)
data$CountZero<-NA
for (i in 1:nrow(data)){
data[i,"CountZero"]<-length(which((data[i,1:(ncol(data)-2)]==0)))}
count(data, CountZero)

filter() or subset() all the dataframes stored in a list

If I want to remove all the rows that contain 0s in a specific column, I can just do:
df <- data.frame(a = c(0,1,2,3,0,5),
b = c(1,2,3,5,3,1))
df <- filter(df, a != 0)
How can I do the same if I'm working with lists?
My intuition tells me to use 'lapply' but I cannot seem to make the syntax work:
#same dataframe.
df <- data.frame(a = c(0,1,2,3,0,5),
b = c(1,2,3,5,3,1))
df2 <- df
list.df <- list (df, df2)
lapply(list.df, filter(), a !=0) #don't work. How do I fix this syntax?
Many thanks in advance!
One option involving purrr could be:
map(.x = list.df, ~ .x %>%
filter(a != 0))
[[1]]
a b
1 1 2
2 2 3
3 3 5
4 5 1
[[2]]
a b
1 1 2
2 2 3
3 3 5
4 5 1
You have other options using lapply as:
#Without dplyr
lapply(list.df, function(x)x["a"!=0,])
#With dplyr
library(dplyr)
lapply(list.df, function(x)filter(x,a!=0))
# Result
# [[1]]
# a b
# 1 1 2
# 2 2 3
# 3 3 5
# 4 5 1
#
# [[2]]
# a b
# 1 1 2
# 2 2 3
# 3 3 5
# 4 5 1

variable names in for loop

x_names <-c("x1","x2","x3")
data <- c(1,2,3,4)
fake <- c(2,3,4,5)
for (i in x_names)
{
x = fake
data = as.data.frame(cbind(data,x))
#data <- data %>% rename(x_names = x)
}
I made a toy example. This code will generate a data frame with 1 column called data, and 3 columns called x. Instead of calling the columns x, I want to name them x1, x2, x3 (stored in x_names). I put the x_names in the code (commented out), but it does not work. Could you help me with it?
We can also use map_dfc from tidyverse:
library(tidyverse)
cbind(data, map_dfc(x_names, ~ tibble(!!.x := fake)))
Output:
data x1 x2 x3
1 1 2 2 2
2 2 3 3 3
3 3 4 4 4
4 4 5 5 5
We can avoid the for loop and use replicate to repeat fake data using setNames to name the dataframe with x_names.
cbind(data, setNames(data.frame(replicate(length(x_names), fake)), x_names))
# data x1 x2 x3
#1 1 2 2 2
#2 2 3 3 3
#3 3 4 4 4
#4 4 5 5 5
Ideally one should avoid growing objects in a loop, however one way to solve OP's problem in loop is
for (i in seq_along(x_names)) {
data = cbind.data.frame(data, fake)
names(data)[i + 1] <- x_names[i]
}
An option is just to assign the 'fake' to create the new columns in base R
data[x_names] <- fake
data
# data x1 x2 x3
#1 1 2 2 2
#2 2 3 3 3
#3 3 4 4 4
#4 4 5 5 5
EDIT: Based on comments from #avid_useR
data
data <- data.frame(data)
When you exchange your out-commented line
#data <- data %>% rename(x_names = x)
with
colnames(data)[ncol(data)] <- i
it should set the right colnames.

Subset using loop over data frame in R

I have a dataframe which has 50 variables with values 1-5, but some of them contain values greater than 5, such as 18656. I need to remove all these values from the dataframe. Is there a function which can do this?
I am using this code
# Keep only the rows of `df_likert` whose value in the column *named* by
# `col` is <= 5; rows where that column is NA are dropped, matching
# subset() semantics.
#
# Bug fixed: the original `subset(df_likert, col <= 5)` compared the
# character string held in `col` against 5, not the column's values, so it
# either kept or dropped every row.  Fetch the column with `[[` instead.
# Note the caller must capture the return value — calling `func(df, i)`
# inside a loop without assignment does not modify `df`.
func <- function(df_likert, col){
  keep <- which(df_likert[[col]] <= 5)  # which() silently drops NA rows
  df_likert[keep, , drop = FALSE]       # drop = FALSE: stay a data frame
}
for (i in names(df_likert)) {
func(df_likert, i)
}
library(dplyr)
# example dataset
dt = data.frame(x1 = c(1,2,3,4,5),
x2 = c(3,3,4,5,10),
x3 = c(10,1,1,2,3))
# original dataset
dt
# x1 x2 x3
# 1 1 3 10
# 2 2 3 1
# 3 3 4 1
# 4 4 5 2
# 5 5 10 3
# update dataset
dt %>%
mutate_all(function(x) ifelse(x > 5, NA, x)) %>%
na.omit()
# x1 x2 x3
# 2 2 3 1
# 3 3 4 1
# 4 4 5 2
This solution removes all rows with values more than 5, as you mentioned. If you exclude the na.omit part you can just replace those values with NA instead of removing the whole row.

Resources