create column based on last non missing value of other column - r

Very similar to this question, I am trying to populate a new variable by finding the last non-missing value by group for an existing variable in a data frame, ideally using dplyr/zoo. However, I want to keep only the last value for the whole group, not merely overwrite the missing entries. Consider the following minimal example:
df1 <- data.frame(ID = c(1, 1, 1, 2, 2,2),
date = c(1,2,3,1,2,3),
var1 = c('a', '', 'b', '','c', ''))
df2 = ## R-commands to get:
df2 <- data.frame(ID = c(1, 1, 1, 2, 2,2),
date = c(1,2,3,1,2,3),
var1 = c('b', 'b', 'b', 'c','c', 'c'))

Using dplyr,
library(dplyr)
df1 %>%
group_by(ID) %>%
mutate(var1 = last(var1[var1 != '']))
which gives,
# A tibble: 6 x 3
# Groups: ID [2]
ID date var1
<dbl> <dbl> <fct>
1 1 1 b
2 1 2 b
3 1 3 b
4 2 1 c
5 2 2 c
6 2 3 c

Here is one option with base R using ave
df1$var1 <- with(df1, ave(as.character(var1), ID, FUN =
function(x) tail(x[nzchar(x)], 1)))
df1$var1
#[1] "b" "b" "b" "c" "c" "c"

Related

Creating a function to remove columns with different names from a list of dataframes

I have many dataframes that contain the same data, except for a few column differences between them that I want to remove. Here's something similar to what I have:
df1 <- data.frame(X = c(1, 2, 3, 4, 5),
var1 = c('a', 'b', 'c', 'd', 'e'),
var2 = c(1, 1, 0, 0, 1))
df2 <- data.frame(X..x = c(1, 2, 3, 4, 5),
X..y = c(1, 2, 3, 4, 5),
var1 = c('f', 'g', 'h', 'i', 'j'),
var2 = c(0, 1, 0, 1, 1))
df_list <- list(df1=df1,df2=df2)
I am trying to create a function to remove the X, X..x, and X..y columns from each of the dataframes. Here's what I've tried with the given error:
# Attempt to drop the columns X, X..x and X..y from a data frame.
#
# df: a data frame.
# Returns df without the three listed columns.
#
# NOTE: every name inside select = -c(...) is looked up in df, so this
# version raises "object '...' not found" when applied to a data frame
# that lacks any one of the three columns.
remove_col <- function(df){
df = subset(df, select = -c(X, X..x, X..y))
return(df)
}
df_list <- lapply(df_list, remove_col)
# Error in eval(substitute(select), nl, parent.frame()) :
# object 'X..x' not found
I'm running into problems because not all dataframes contain X, and similarly not all dataframes contain X..x and X..y. How can I update the function so that it can be applied to all dataframes in the list and successfully remove its given columns?
Using R version 3.5.1, Mac OS X 10.13.6
You can try:
#Function
# Remove the columns listed in `name` from `df`.
#
# df:   a data frame.
# name: character vector of column names to drop; names that are not
#       present in `df` are silently ignored.
# Returns `df` without the listed columns, always as a data frame
# (even when a single column, or no column, remains).
remove_col <- function(df, name) {
  # A logical mask is used rather than negative positions: with
  # `which()` + `df[, -vec]`, an empty `vec` (no listed column present)
  # would select zero columns instead of leaving `df` untouched.
  keep <- !(names(df) %in% name)
  # drop = FALSE prevents collapsing to a bare vector when only one
  # column survives.
  df[, keep, drop = FALSE]
}
df_list <- lapply(df_list, remove_col,name=c('X', 'X..x', 'X..y'))
$df1
var1 var2
1 a 1
2 b 1
3 c 0
4 d 0
5 e 1
$df2
var1 var2
1 f 0
2 g 1
3 h 0
4 i 1
5 j 1
if you want to keep only the columns with "var"
lapply(df_list, function(x) x[grepl("var",colnames(x))])
or if you really just want those removed explicitly
lapply(df_list, function(x) x[!grepl("^X$|^X\\.\\.x$|^X\\.\\.y$",colnames(x))])
$df1
var1 var2
1 a 1
2 b 1
3 c 0
4 d 0
5 e 1
$df2
var1 var2
1 f 0
2 g 1
3 h 0
4 i 1
5 j 1
Instead of checking each list element for the same column names, it can be automated if we can extract the intersecting column names across the list. Loop over the list, get the column names, find the intersecting elements with Reduce and use that to subset the columns
nm1 <- Reduce(intersect, lapply(df_list, names))
lapply(df_list, `[`, nm1)
#$df1
# var1 var2
#1 a 1
#2 b 1
#3 c 0
#4 d 0
#5 e 1
#$df2
# var1 var2
#1 f 0
#2 g 1
#3 h 0
#4 i 1
#5 j 1
Or with tidyverse
library(dplyr)
library(purrr)
map(df_list, names) %>%
reduce(intersect) %>%
map(df_list, select, .)

How to group the data by id and get unique values of all columns in R?

I have a table with ID and other columns. I want to group the data by Ids and get the unique values of all columns.
from above table group by ID and get unique(Alt1, Alt2, Alt3)
The result should be in vector form
A -> 1,2,3,5
B ->1,3,4,5,7
We can get data in long format and for each ID make a list of unique values.
library(dplyr)
library(tidyr)
df1 <- df %>%
pivot_longer(cols = -ID) %>%
group_by(ID) %>%
summarise(value = list(unique(value))) %>%
unnest(value)
df1
# ID value
# <fct> <dbl>
# 1 A 1
# 2 A 3
# 3 A 2
# 4 A 5
# 5 B 1
# 6 B 4
# 7 B 5
# 8 B 3
# 9 B 6
#10 B 7
We can store it as a list if needed using split.
split(df1$value, df1$ID)
#$A
#[1] 1 3 2 5
#$B
#[1] 1 4 5 3 6 7
data.table equivalent of the above would be :
# data.table approach: melt to long format, then collect the unique
# values for each ID into a list column.
# Package names are case-sensitive: the package is `data.table`.
library(data.table)
setDT(df)
df2 <- melt(df, id.vars = 'ID')[, .(value = list(unique(value))), ID]
unique values are present in df2$value as a vector.
data
df <- data.frame(ID = c('A', 'A', 'B', 'B'),
Alt1 = c(1, 2, 1, 3),
Alt2 = c(3, 5, 4, 6),
Alt3 = c(1, 3, 5, 7))

Subset with all values for a variable in R

I have a Data Frame with a variable with different values for another variable.
Like this:
DataFrame
So, I need a subset containing only the values of S whose rows cover all the possible values of B. In this example, the subset consists of S = a and S = b:
Subset
Any idea? Thanks!!
An option would be to group by 'S' and filter the rows having all the unique values of the column 'B' %in% 'B'
library(dplyr)
un1 <- unique(df1$B)
df1 %>%
group_by(S) %>%
filter(all(un1 %in% B))
# A tibble: 8 x 2
# Groups: S [2]
# S B
# <fct> <dbl>
#1 a 1
#2 a 2
#3 a 3
#4 a 4
#5 d 1
#6 d 2
#7 d 3
#8 d 4
Or with data.table
library(data.table)
setDT(df1)[, .SD[all(un1 %in% B)], S]
Or using base R
df1[with(df1, ave(B, S, FUN = function(x) all(un1 %in% x)) == 1),]
data
df1 <- data.frame(S = rep(letters[1:4], c(4, 3, 2, 4)),
B = c(1:4, c(1, 3, 4), 1:2, 1:4))

How to delete duplicate rows (the shorter ones) based on certain columns?

Suppose I have the following df
df <- data.frame(col1 = c(1, 3, 1), col2 = c(2, 4, 2), col3 = c(NA, NA, "c"))
> df
col1 col2 col3
1 1 2 <NA>
2 3 4 <NA>
3 1 2 c
My goal is to delete all duplicate rows based on col1 and col2 such that the longer row "survives". In this case, the first row should be deleted. I tried
df[duplicated(df[, 1:2]), ]
but this gives me only the third row (and not the third and the second one). How to do it properly?
EDIT: The real df has 15 columns, of which the first 13 are used for identifying duplicates. In the last two columns roughly 2/3 of the rows are filled with NAs (the first 13 columns do not contain any NAs). Thus, my example df was misleading in the sense that there are two columns to be excluded for identifying the duplicates. I am sorry for that.
You can try this:
library(dplyr)
df %>% group_by(col1,col2) %>%
slice(which.min(is.na(col3)))
or this :
df %>%
group_by(col1,col2) %>%
arrange(col3) %>%
slice(1)
# # A tibble: 2 x 3
# # Groups: col1, col2 [2]
# col1 col2 col3
# <dbl> <dbl> <fctr>
# 1 1 2 c
# 2 3 4 NA
A GENERAL SOLUTION
with the most general solution there can be only one row per value of col1, see comment below to add col2 to the grouping variables. It assumes all NAs are on the right.
df %>% mutate(nna = df %>% is.na %>% rowSums) %>%
group_by(col1) %>% # or group_by(col1,col2)
slice(which.min(nna)) %>%
select(-nna)
df <- data.frame(col1 = c(1, 3, 1), col2 = c(2, 4, 2), col3 = c(NA, NA, "c"))
df <- df[order(df$col3),]
duplicates <- duplicated(df[,1:2])
duplicates_sub <- subset(df , duplicates == FALSE)
> duplicates_sub
col1 col2 col3
3 1 2 c
2 3 4 <NA>
EDIT: Keep all non-NA rows
df <- data.frame(col1 = c(1, 3, 1,3, 1), col2 = c(2, 4, 2,4, 2), col3 = c("a", NA, "c",NA, "b"))
df <- df[order(df$col3),]
duplicates <- duplicated(df[,1:2]) & is.na(df[,3])
duplicates_sub <- subset(df , duplicates == FALSE)
> duplicates_sub
col1 col2 col3
1 1 2 a
5 1 2 b
3 1 2 c
2 3 4 <NA>
You can sort NAs to the top or bottom before dropping dupes:
# in base, which puts NAs last
odf = df[do.call(order, df), ]
odf[!duplicated(odf[, c("col1", "col2")]), ]
# col1 col2 col3
# 3 1 2 c
# 2 3 4 <NA>
# or with data.table, which puts NAs first
library(data.table)
DF = setorder(data.table(df))
unique(DF, by=c("col1", "col2"), fromLast=TRUE)
# col1 col2 col3
# 1: 1 2 c
# 2: 3 4 NA
This approach cannot be taken with dplyr, which doesn't offer "sort by all columns" in arrange, nor fromLast in distinct.

Sample by group with a condition (r)

I need to randomly select a diary for each individual (id) but only for those who filled more than one.
Let us suppose my data look like this
dta = rbind(c(1, 1, 'a'),
c(1, 2, 'a'),
c(1, 3, 'b'),
c(2, 1, 'a'),
c(3, 1, 'b'),
c(3, 2, 'a'),
c(3, 3, 'c'))
colnames(dta) <- c('id', 'DiaryNumber', 'type')
dta = as.data.frame(dta)
dta
id DiaryNumber type
1 1 a
1 2 a
1 3 b
2 1 a
3 1 b
3 2 a
3 3 c
For example, id 1 filled 3 diaries. What I need is to randomly select one of the 3 diaries. Id 2 only filled one diary, so I do not need to do anything with it.
I have no idea how I could do that.
Any ideas ?
You can use sample_n:
library(dplyr)
dta %>% group_by(id) %>% sample_n(1)
## Source: local data frame [3 x 3]
## Groups: id
##
## id DiaryNumber type
## 1 1 2 a
## 2 2 1 a
## 3 3 1 b
Base package:
set.seed(123)
df <- lapply(split(dta, dta$id), function(x) x[sample(nrow(x), 1), ])
do.call("rbind", df)
Output:
id DiaryNumber type
1 1 1 a
2 2 1 a
3 3 2 a

Resources