Dataframe NA conversion to specific items - r

I have a data frame like;
dataframe <- data.frame(ID1=c(NA,2,3,1,NA,2),ID2=c(1,2,3,1,2,2))
Now I want to replace each NA value in ID1 with the value from the same row of the next column (ID2), like this:
dataframe <- data.frame(ID1=c(1,2,3,1,2,2),ID2=c(1,2,3,1,2,2))
I think I should use the if function, but I would like to use %>% for simplicity.
Please teach me.

An ifelse solution
dataframe <- within(dataframe, ID1 <- ifelse(is.na(ID1),ID2,ID1))
such that
> dataframe
ID1 ID2
1 1 1
2 2 2
3 3 3
4 1 1
5 2 2
6 2 2

A straightforward solution is to find out NA values in ID1 and replace them with corresponding values from ID2.
inds <- is.na(dataframe$ID1)
dataframe$ID1[inds] <- dataframe$ID2[inds]
However, since you want a solution with pipes you can use coalesce from dplyr
library(dplyr)
dataframe %>% mutate(ID1 = coalesce(ID1, ID2))
# ID1 ID2
#1 1 1
#2 2 2
#3 3 3
#4 1 1
#5 2 2
#6 2 2

A dplyr (using %>%) solution:
sanitized <- dataframe %>%
mutate(ID1 = ifelse(is.na(ID1), ID2, ID1))

Related

R Compare duplicate values for each row in two data sets + in any order

I want to compare whether the sets of values in each row are the same across the two data sets, regardless of row order.
In this case, the duplicated and all_equal functions are not suitable.
Reproducible Sample Data
a=c(1,1)
b=c(2,2)
c=c(3,3)
d=c(4,5)
df1<-rbind(a,b,c)
df1<-df1 %>% as.data.frame()
df2<-rbind(a,d,b)
df2<-df2 %>% as.data.frame()
> df1
V1 V2
a 1 1
b 2 2
c 3 3
> df2
V1 V2
a 1 1
d 4 5
b 2 2
Expected output
df1$idx1 <- 1:nrow(df1)
df2$idx2 <- 1:nrow(df2)
df1
df2
df3<-full_join(df1,df2,by=c('V1','V2'))
df3
df3$need <- ifelse(is.na(df3$idx2), 'only_df1',
ifelse(is.na(df3$idx1), 'only_df2',
'duplicated'))
> df3
V1 V2 idx1 idx2 need
1 1 1 1 1 duplicated
2 2 2 2 3 duplicated
3 3 3 3 NA only_df1
4 4 5 NA 2 only_df2
I tried the approach above, but it is complicated.
I think there must be a better way. Help!
Since you are already using dplyr, you may use case_when, which is easier to read and write, especially when you have a lot of conditions.
library(dplyr)
full_join(df1,df2,by=c('V1','V2')) %>%
mutate(need = case_when(is.na(idx2) ~ 'only_df1',
is.na(idx1) ~ 'only_df2',
TRUE ~ 'duplicated'))
# V1 V2 idx1 idx2 need
#1 1 1 1 1 duplicated
#2 2 2 2 3 duplicated
#3 3 3 3 NA only_df1
#4 4 5 NA 2 only_df2
As already mentioned in the comments, your way looks ok. In case you want to see how it could be done in base:
a <- c(1,1)
b <- c(2,2)
c <- c(3,3) #Better don't use existing function names
d <- c(4,5)
df1 <- as.data.frame(rbind(a,b,c))
df2 <- as.data.frame(rbind(a,d,b))
df1$idx1 <- seq_len(nrow(df1)) #seq_len will also work in case nrow=0
df2$idx2 <- seq_len(nrow(df2))
df3 <- merge(df1, df2, all=TRUE)
df3$need <- ifelse(is.na(df3$idx2), "only_df1",
ifelse(is.na(df3$idx1), "only_df2",
"duplicated"))
df3
# V1 V2 idx1 idx2 need
#1 1 1 1 1 duplicated
#2 2 2 2 3 duplicated
#3 3 3 3 NA only_df1
#4 4 5 NA 2 only_df2
We can use
library(arsenal)
summary(comparedf(df1, df2))

Rename dataframe column names by switching string patterns

I have the following dataframe and I want to rename the columns to c("WBC_MIN_D7", "WBC_MAX_D7", "DBP_MIN_D3")
> dataf <- data.frame(
+ WBC_D7_MIN=1:4,WBC_D7_MAX=1:4,DBP_D3_MIN=1:4
+ )
> dataf
WBC_D7_MIN WBC_D7_MAX DBP_D3_MIN
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
> names(dataf)
[1] "WBC_D7_MIN" "WBC_D7_MAX" "DBP_D3_MIN"
The rename_with function in the tidyverse can probably do it, but I cannot figure out how.
You can use capture groups with sub to extract values in order -
names(dataf) <- sub('^(\\w+)_(\\w+)_(\\w+)$', '\\1_\\3_\\2', names(dataf))
Same regex can be used in rename_with -
library(dplyr)
dataf %>% rename_with(~ sub('^(\\w+)_(\\w+)_(\\w+)$', '\\1_\\3_\\2', .))
# WBC_MIN_D7 WBC_MAX_D7 DBP_MIN_D3
#1 1 1 1
#2 2 2 2
#3 3 3 3
#4 4 4 4
You can also rename the columns of dataf directly by assigning a character vector of new names, i.e. names(yourDF) <- c("A","B",...,"Z"):
names(dataf) <- c("WBC_MIN_D7", "WBC_MAX_D7", "DBP_MIN_D3")

fuzzy_full_join over multiple variables duplicates columns in R

I do a fuzzy_full_join of two tables in R requiring multiple keys to match. Some
rows do not match. The output has duplicated the keys. This does not happen
with a non-fuzzy full join. What is the best way to remove the duplicates? I
have a solution, but it seems cumbersome.
Example:
x<-data.frame("id"=c(1,1,2,2), "time" = c(1,2,1,2), "meas1" = c(1,2,3,4))
y<-data.frame("id"=c(1,1,2,2), "time" =c(1,3,2,4),"meas2"=c(-1,-2,-3,-4))
# compare full_join output with fuzzy_full_join
full_join(x,y,by=c('id'='id','time'='time'))
fuzzy_full_join(x,y,by=c('id'='id','time'='time'),match_fun=list(`==`,`==`))
# make fuzzy_full_join output match full_join output
fuzzy_full_join(x,y,by=c('id'='id','time'='time'),match_fun=list(`==`,`==`)) %>%
mutate(id=if_else(is.na(id.x),id.y,id.x)) %>%
select(-id.x,-id.y) %>%
mutate(time=if_else(is.na(time.x),time.y,time.x)) %>%
select(-time.y,-time.x)
We can use coalesce which might help reduce the code.
library(dplyr)
library(fuzzyjoin)
fuzzy_full_join(x,y,by=c('id'='id','time'='time'),match_fun=list(`==`,`==`)) %>%
mutate(id=coalesce(id.x, id.y), time = coalesce(time.x, time.y)) %>%
select(-matches('\\.x$|\\.y$'))
# meas1 meas2 id time
#1 1 -1 1 1
#2 4 -3 2 2
#3 2 NA 1 2
#4 3 NA 2 1
#5 NA -2 1 3
#6 NA -4 2 4

combine datasets by the value of multiple columns

I'm trying to enter values based on the value of multiple columns from two datasets.
I have my main dataset (df1), with lists of a location and corresponding dates and df2 consists of a list of temperatures at all locations on every possible date. Eg:
df1
Location Date
A 2
B 1
C 1
D 3
B 3
df2
Location Date1Temp Date2Temp Date3Temp
A -5 -4 0
B 2 0 2
C 4 4 5
D 6 3 4
I would like to create a temperature variable in df1, according to the location and date of each observation. Preferably I would like to carry this out with all Temperature data in the same dataframe, but this can be separated and added 'by date' if necessary. With the example data, I would want this to create something like this:
Location Date Temp
A 2 -4
B 1 2
C 1 4
D 3 4
B 3 2
I've been playing around with merge and ifelse, but haven't figured anything out yet.
Is this what you need?
library(reshape2)
library(magrittr)
df1 <- data.frame(Location= c("A","B","C","D","B"),Date=c(2,1,1,3,3))
df2 <- data.frame(Location= c("A","B","C","D"),d1t=c(-5,5,4,6),d2t=c(-4,0,4,3),d3t=c(0,2,5,4))
merge(df1,df2) %>% melt(id.vars=c("Location","Date"))
Here's how to do that with dplyr and tidyr.
Basically, you want to use gather to melt the DateXTemp columns from df2 into two columns. Then, you want to use gsub to remove the "Date" and "Temp" strings, leaving numbers that are comparable to the Date values in df1. Since the DateXTemp column names were character strings, the remaining digits are still characters, so you need to convert them to numeric with as.numeric. I then use left_join to join the tables.
library(dplyr);library(tidyr)
df1 <- data.frame(Location= c("A","B","C","D","B"),Date=c(2,1,1,3,3))
df2 <- data.frame(Location= c("A","B","C","D"),Date1Temp=c(-5,5,4,6),
Date2Temp=c(-4,0,4,3),Date3Temp=c(0,2,5,4))
df2_new <- df2%>%
gather(Date,Temp,Date1Temp:Date3Temp)%>%
mutate(Date=gsub("Date|Temp","",Date))%>%
mutate(Date=as.numeric(Date))
df1%>%left_join(df2_new)
Joining, by = c("Location", "Date")
Location Date Temp
1 A 2 -4
2 B 1 5
3 C 1 4
4 D 3 4
5 B 3 2
EDIT
As suggested by @Sotos, you can do that in a single pipe like so:
df2%>%
gather(Date,Temp,Date1Temp:Date3Temp)%>%
mutate(Date=gsub("Date|Temp","",Date))%>%
mutate(Date=as.numeric(Date))%>%
left_join(df1,.)
Joining, by = c("Location", "Date")
Location Date Temp
1 A 2 -4
2 B 1 5
3 C 1 4
4 D 3 4
5 B 3 2

How to integrate set of vector in multiple data.frame into one without duplication?

I have position index vectors stored in several data.frame objects, but the order of the position index vectors differs from one data.frame to the next. I want to integrate/merge these data.frame objects into one common data.frame with a very specific order and without any duplicated rows. Does anyone know a trick for doing this more easily? Can anyone propose a possible approach to accomplish this task?
data
v1 <- data.frame(
foo=c(1,2,3),
bar=c(1,2,2),
bleh=c(1,3,0))
v2 <- data.frame(
bar=c(1,2,3),
foo=c(1,2,0),
bleh=c(3,3,4))
v3 <- data.frame(
bleh=c(1,2,3,4),
foo=c(1,1,2,0),
bar=c(0,1,2,3))
initial output after integrating them:
initial_output <- data.frame(
foo=c(1,2,3,1,2,0,1,1,2,0),
bar=c(1,2,2,1,2,3,0,1,2,3),
bleh=c(1,3,0,3,3,4,1,2,3,4)
)
remove duplication
rmDuplicate_output <- data.frame(
foo=c(1,2,3,1,0,1,1),
bar=c(1,2,2,1,3,0,1),
bleh=c(1,3,0,3,4,1,2)
)
final desired output:
final_output <- data.frame(
foo=c(1,1,1,1,2,3,0),
bar=c(0,1,1,1,2,2,3),
bleh=c(1,1,2,3,3,0,4)
)
How can I get my final desired output easily? Is there an efficient way to do this sort of manipulation on data.frame objects? Thanks
You could also use the mget/ls combo to get your data frames programmatically (without typing individual names) and then use data.table's rbindlist and unique functions/methods for a great efficiency gain (see here and here)
library(data.table)
unique(rbindlist(mget(ls(pattern = "v\\d+")), use.names = TRUE))
# foo bar bleh
# 1: 1 1 1
# 2: 2 2 3
# 3: 3 2 0
# 4: 1 1 3
# 5: 0 3 4
# 6: 1 0 1
# 7: 1 1 2
As a side note, it is usually better to keep multiple data.frames in a single list so that you have better control over them
We can use bind_rows from dplyr, remove the duplicates with distinct and arrange by 'bar'
library(dplyr)
bind_rows(v1, v2, v3) %>%
distinct %>%
arrange(bar)
# foo bar bleh
#1 1 0 1
#2 1 1 1
#3 1 1 3
#4 1 1 2
#5 2 2 3
#6 3 2 0
#7 0 3 4
Here is a solution:
# combine dataframes
df = rbind(v1, v2, v3)
# remove duplicated
df = df[! duplicated(df),]
# sort by 'bar' column
df[order(df$bar),]
foo bar bleh
7 1 0 1
1 1 1 1
4 1 1 3
8 1 1 2
2 2 2 3
3 3 2 0
6 0 3 4

Resources