Assign NA to duplicate values within dataframe prior to using reshape - r

I want to assign NA to duplicate values prior to using reshape, in order to avoid duplicates in my wide dataset after reshaping. In the example data frame below, I would like to assign NA to all duplicate values in X1 and X2, but not X3, for each ID in my dataset. This means that for ID=3, NA should be assigned to X2 in row 4, and for ID=4 this applies to X1 in rows 7 and 8, and to X2 in rows 6 and 8. Values of X3 should remain unchanged. I want to assign NA (rather than drop rows) so that all rows remain in the data frame.
df <- read.table(header=TRUE,text =
"ID X1 X2 X3
1 A X 23
2 B Y 4
3 A X 32
3 B X 6
4 A Y 45
4 B Y 7
4 A Z 5
4 B Z 3
")
ID X1 X2 X3
1 1 A X 23
2 2 B Y 4
3 3 A X 32
4 3 B X 6
5 4 A Y 45
6 4 B Y 7
7 4 A Z 5
8 4 B Z 3

duplicated() is useful for identifying duplicates.
df[duplicated(df[c("ID", "X1")]), "X1"] = NA
df[duplicated(df[c("ID", "X2")]), "X2"] = NA
df
# ID X1 X2 X3
# 1 1 A X 23
# 2 2 B Y 4
# 3 3 A X 32
# 4 3 B <NA> 6
# 5 4 A Y 45
# 6 4 B <NA> 7
# 7 4 <NA> Z 5
# 8 4 <NA> <NA> 3

We can use dplyr
library(dplyr)
df %>%
group_by(ID) %>%
mutate_each(funs(replace(., duplicated(.), NA)), X1:X2)
# ID X1 X2 X3
# <int> <fctr> <fctr> <int>
#1 1 A X 23
#2 2 B Y 4
#3 3 A X 32
#4 3 B NA 6
#5 4 A Y 45
#6 4 B NA 7
#7 4 NA Z 5
#8 4 NA NA 3
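Note that mutate_each() and funs() are deprecated in newer versions of dplyr. A minimal sketch of the same idea using across() (assuming dplyr >= 1.0):
library(dplyr)
df %>%
  group_by(ID) %>%
  # within each ID, blank out the 2nd and later occurrences of each value
  mutate(across(X1:X2, ~ replace(.x, duplicated(.x), NA))) %>%
  ungroup()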

You could try:
library(data.table)
setDT(df)
df[, c("X1", "X2") := .(ifelse(duplicated(X1), NA, X1),
                        ifelse(duplicated(X2), NA, X2)), by = ID]
Result:
ID X1 X2 X3
1: 1 A X 23
2: 2 B Y 4
3: 3 A X 32
4: 3 B NA 6
5: 4 A Y 45
6: 4 B NA 7
7: 4 NA Z 5
8: 4 NA NA 3

Related

How to conditionally update a R tibble using multiple conditions of another tibble

I have two tables. I would like to update the first table using a second table using multiple conditions. In base R I would use if...else type constructs to do this but would like to know how to achieve this using dplyr.
The table to be updated (have a field added) looks like this:
> Intvs
# A tibble: 12 x 3
Group From To
<chr> <dbl> <dbl>
1 A 0 1
2 A 1 2
3 A 2 3
4 A 3 4
5 A 4 5
6 A 5 6
7 B 0 1
8 B 1 2
9 B 2 3
10 B 3 4
11 B 4 5
12 B 5 6
The tibble that I would like to use to make the update looks like this:
> Zns
# A tibble: 2 x 4
Group From To Zone
<chr> <chr> <dbl> <dbl>
1 A X 1 5
2 B Y 3 4
I would like to update the Intvs tibble with the Zns tibble using the fields == Group, >= From, and <= To to control the update. The expected output should look like this
> Intvs
# A tibble: 12 x 4
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 NA
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 NA
12 B 5 6 NA
What is the most efficient way to do this using dplyr?
The code below should make the dummy tables Intvs and Zns:
# load packages
require(tidyverse)
# Intervals table
a <- c(rep("A", 6), rep("B", 6))
b <- c(seq(0,5,1), seq(0,5,1) )
c <- c(seq(1,6,1), seq(1,6,1))
Intvs <- bind_cols(a, b, c)
names(Intvs) <- c("Group", "From", "To")
# Zones table
a <- c("A", "B")
b <- c("X", "Y")
c <- c(1, 3)
d <- c(5, 4)
Zns <- bind_cols(a, b, c, d)
names(Zns) <- c("Group", "From", "To", "Zone")
Using a non-equi join from data.table:
library(data.table)
setDT(Intvs)[Zns, Zone := Zone, on = .(Group, From >= From, To <= To)]
Output:
> Intvs
Group From To Zone
<char> <num> <num> <char>
1: A 0 1 <NA>
2: A 1 2 X
3: A 2 3 X
4: A 3 4 X
5: A 4 5 X
6: A 5 6 <NA>
7: B 0 1 <NA>
8: B 1 2 <NA>
9: B 2 3 <NA>
10: B 3 4 Y
11: B 4 5 <NA>
12: B 5 6 <NA>
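Since the question asks for a dplyr solution: newer dplyr versions support non-equi joins directly via join_by(), so the same logic as the data.table join above can be expressed without data.table. A hedged sketch, assuming dplyr >= 1.1.0 and that Zns uses the corrected column names (Group, Zone, From, To, with From/To as the numeric bounds):
library(dplyr)  # join_by() with inequality conditions needs dplyr >= 1.1.0
Intvs %>%
  left_join(Zns, by = join_by(Group, From >= From, To <= To),
            suffix = c("", "_Zns")) %>%
  # keep the original interval columns plus the matched Zone (NA if no match)
  select(Group, From, To, Zone)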
This is the closest I got; it does not give the expected output:
library(dplyr)
left_join(Intvs, Zns, by="Group") %>%
group_by(Group) %>%
mutate(Zone1 = case_when(From.x <= Zone & From.x >= To.y ~ From.y)) %>%
select(Group, From=From.x, To=To.x, Zone = Zone1)
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 X
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 Y
12 B 5 6 NA
Not sure why your first row does not give NA, since 0 - 1 is not in the range of 1 - 5.
First left_join the two dataframes using the Group column. Here I assign the suffix "_Zns" to columns coming from the Zns dataframe. Then use a single case_when() (or ifelse()) statement to assign NA to rows that do not fit the range. Finally, drop the columns that end with "_Zns".
library(dplyr)
left_join(Intvs, Zns, by = "Group", suffix = c("", "_Zns")) %>%
mutate(Zone = case_when(From >= From_Zns & To <= To_Zns ~ Zone,
TRUE ~ NA_character_)) %>%
select(-ends_with("Zns"))
# A tibble: 12 × 4
Group From To Zone
<chr> <dbl> <dbl> <chr>
1 A 0 1 NA
2 A 1 2 X
3 A 2 3 X
4 A 3 4 X
5 A 4 5 X
6 A 5 6 NA
7 B 0 1 NA
8 B 1 2 NA
9 B 2 3 NA
10 B 3 4 Y
11 B 4 5 NA
12 B 5 6 NA
Data
Note that I have changed your column name order in the Zns dataframe.
a <- c(rep("A", 6), rep("B", 6))
b <- c(seq(0,5,1), seq(0,5,1) )
c <- c(seq(1,6,1), seq(1,6,1))
Intvs <- bind_cols(a, b, c)
names(Intvs) <- c("Group", "From", "To")
# Zones table
a <- c("A", "B")
b <- c("X", "Y")
c <- c(1, 3)
d <- c(5, 4)
Zns <- bind_cols(a, b, c, d)
colnames(Zns) <- c("Group", "Zone", "From", "To")

Reshape a dataframe with X1, Y1,X2,Y2 to X, Y1, Y2 in R

I have a dataframe df with XY combinations as follows
> df <- data.frame(X1=c(1:4),Y1=c(16:13),X2=c(4:7),Y2=c(-1:-4))
> df
X1 Y1 X2 Y2
1 1 16 4 -1
2 2 15 5 -2
3 3 14 6 -3
4 4 13 7 -4
and want to reshape df to df2 by merging X1 and X2 into a new variable X, adding NA where Y1 or Y2 is left without a value.
The result would look like this
> df2
X Y1 Y2
1 1 16 NA
2 2 15 NA
3 3 14 NA
4 4 13 -1
5 5 NA -2
6 6 NA -3
7 7 NA -4
What is the most efficient way to do this?
You can use dplyr::full_join:
df2 <- dplyr::full_join(df[, c("X1", "Y1")], df[, c("X2", "Y2")], by = c("X1" = "X2"))
names(df2)[1] <- "X"
df2
# X Y1 Y2
#1 1 16 NA
#2 2 15 NA
#3 3 14 NA
#4 4 13 -1
#5 5 NA -2
#6 6 NA -3
#7 7 NA -4
Using merge from base R
merge(df[c('X1', 'Y1')], df[c('X2', 'Y2')], by.x = 'X1', by.y = 'X2', all = TRUE)

Transpose multiple columns as column names and fill with values in R

The sample data as following:
x <- read.table(header=T, text="
ID CostType1 Cost1 CostType2 Cost2
1 a 10 c 1
2 b 2 c 20
3 a 1 b 50
4 a 40 c 1
5 c 2 b 30
6 a 60 c 3
7 c 10 d 1
8 a 20 d 2")
I want the values in the second and fourth columns (CostType1 and CostType2) to become the names of new columns, with each new column filled with the corresponding cost for that cost type. If there is no match, fill with NA. The ideal format would be the following:
a b c d
1 10 NA 1 NA
2 NA 2 20 NA
3 1 50 NA NA
4 40 NA 1 NA
5 NA 30 2 NA
6 60 NA 3 NA
7 NA NA 10 1
8 20 NA NA 2
A solution using the tidyverse. We can first work out how many groups there are; in this example, there are two. We then reshape each group, combine the results, and summarise the data frame by taking the first non-NA value in each column.
library(tidyverse)
# Get the group numbers
g <- (ncol(x) - 1)/2
x2 <- map_dfr(1:g, function(i){
# Transform the data frame one group at a time
x <- x %>%
select(ID, ends_with(as.character(i))) %>%
spread(paste0("CostType", i), paste0("Cost", i))
return(x)
}) %>%
group_by(ID) %>%
# Select the first non-NA value if there are multiple values
summarise_all(funs(first(.[!is.na(.)])))
x2
# # A tibble: 8 x 5
# ID a b c d
# <int> <int> <int> <int> <int>
# 1 1 10 NA 1 NA
# 2 2 NA 2 20 NA
# 3 3 1 50 NA NA
# 4 4 40 NA 1 NA
# 5 5 NA 30 2 NA
# 6 6 60 NA 3 NA
# 7 7 NA NA 10 1
# 8 8 20 NA NA 2
A base R solution using reshape:
x1 <- setNames(x[,c("ID", "CostType1", "Cost1")], c("ID", "CostType", "Cost"))
x2 <- setNames(x[,c("ID", "CostType2", "Cost2")], c("ID", "CostType", "Cost"))
reshape(data=rbind(x1, x2), idvar="ID", timevar="CostType", v.names="Cost", direction="wide")
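spread() and funs() are superseded in current tidyr/dplyr. A hedged alternative sketch using pivot_longer()/pivot_wider() (assuming tidyr >= 1.0), which handles both CostType groups in one pass; the column order may differ from the ideal output shown above:
library(dplyr)
library(tidyr)
x %>%
  # stack CostType1/Cost1 and CostType2/Cost2 into CostType/Cost pairs
  pivot_longer(-ID,
               names_to = c(".value", "set"),
               names_pattern = "(CostType|Cost)(\\d)") %>%
  select(-set) %>%
  # one column per cost type, NA where an ID has no cost of that type
  pivot_wider(names_from = CostType, values_from = Cost)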

Selecting top N rows for each group based on value in column

I have dataframe like below :-
x<-c(3,2,1,8,7,11,10,9,7,5,4)
y<-c("a","a","a", "b","b","c","c","c","c","c","c")
z<-c(2,2,2,1,1,3,3,3,3,3,3)
df<-data.frame(x,y,z)
df
x y z
1 3 a 2
2 2 a 2
3 1 a 2
4 8 b 1
5 7 b 1
6 11 c 3
7 10 c 3
8 9 c 3
9 7 c 3
10 5 c 3
11 4 c 3
I want to select the top n rows for each group defined by column y, where n is provided in column z.
So the output should look like this:
x y z
1 3 a 2
2 2 a 2
3 8 b 1
4 11 c 3
5 10 c 3
6 9 c 3
A solution with base R:
# df is split according to y, then we keep only the top "z" value (after ordering x)
# and rbind everything back together:
do.call(rbind,
lapply(split(df, df$y),
function(df1) df1[order(df1$x, decreasing=TRUE), ][1:unique(df1$z), ]))
# x y z
#a.1 3 a 2
#a.2 2 a 2
#b 8 b 1
#c.6 11 c 3
#c.7 10 c 3
#c.8 9 c 3
EDIT:
A much more direct way (still in base R), provided in a comment by #mt1022:
df[ave(1:nrow(df), df$y, FUN = seq_along) <= df$z, ]
# x y z
#1 3 a 2
#2 2 a 2
#4 8 b 1
#6 11 c 3
#7 10 c 3
#8 9 c 3
One approach with data.table:
library(data.table)
setDT(df)
df[, .(inc = seq_len(.N) <= z, x, z), by = .(y)][inc == TRUE, -2]
# y x z
#1: a 3 2
#2: a 2 2
#3: b 8 1
#4: c 11 3
#5: c 10 3
#6: c 9 3
A solution with dplyr that uses do:
df %>%
group_by(y) %>%
do(head(.,as.numeric(unique(.$z))))
I'm posting the solution I was looking for using dplyr. It is based on #HNSKD:
library(dplyr)
x<-c(3,2,1,8,7,11,10,9,7,5,4)
y<-c("a","a","a", "b","b","c","c","c","c","c","c")
z<-c(2,2,2,1,1,3,3,3,3,3,3)
df<-data.frame(x,y,z)
df %>% group_by(y) %>% slice(1:2)
Which returns the first two elements for each y:
# A tibble: 6 x 3
# Groups: y [3]
x y z
<dbl> <fct> <dbl>
1 3 a 2
2 2 a 2
3 8 b 1
4 7 b 1
5 11 c 3
6 10 c 3
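Note that slice(1:2) hard-codes n = 2. If n should come from column z, as in the original question, one possible sketch (assuming z is constant within each y group) is to sort within groups and keep the first z rows:
library(dplyr)
df %>%
  group_by(y) %>%
  arrange(desc(x), .by_group = TRUE) %>%  # "top" rows = largest x first
  slice(seq_len(first(z))) %>%            # keep the first z rows per group
  ungroup()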

Fill missing values with new data R-Python

I have two datasets, x and y:
> x
a index b
1 1 1 5
2 NA 2 6
3 2 3 NA
4 NA 4 9
> y
index a
1 2 100
2 4 101
>
I would like to fill the missing values of x with the values contained in y.
I have tried to use the merge function but the result is not what I want.
> merge(x,y, by = 'index', all=T)
index a.x b a.y
1 1 1 5 NA
2 2 NA 6 100
3 3 2 NA NA
4 4 NA 9 101
In the real problem there are additional complications:
1 - y does not fill all the missing values
2 - x and y have more variables in common (not only a and index)
EDIT: More realistic example
> x
a index b c
1 1 1 5 NA
2 NA 2 6 NA
3 2 3 NA 5
4 NA 4 9 NA
5 NA 5 10 6
> y
index a c
1 2 100 4
2 4 101 NA
>
A solution in either Python or R would be accepted.
I used your merge idea and did the following using dplyr. I am sure there will be better ways of doing this task.
index <- 1:5
a <- c(1, NA, 2, NA, NA)
b <- c(5,6,NA,9,10)
c <- c(NA,NA,5,NA,6)
ana <- data.frame(index, a,b,c, stringsAsFactors=F)
index <- c(2,4)
a <- c(100, 101)
c <- c(4, NA)
bob <- data.frame(index, a,c, stringsAsFactors=F)
> ana
index a b c
1 1 1 5 NA
2 2 NA 6 NA
3 3 2 NA 5
4 4 NA 9 NA
5 5 NA 10 6
> bob
index a c
1 2 100 4
2 4 101 NA
library(dplyr)
ana %>%
merge(., bob, by = "index", all = TRUE) %>%
mutate(a.x = ifelse(a.x %in% NA, a.y, a.x)) %>%
mutate(c.x = ifelse(c.x %in% NA, c.y, c.x))
index a.x b c.x a.y c.y
1 1 1 5 NA NA NA
2 2 100 6 4 100 4
3 3 2 NA 5 NA NA
4 4 101 9 NA 101 NA
5 5 NA 10 6 NA NA
I overwrote a.x (ana$a) using a.y (bob$a) with mutate. I did a similar thing for c.x (ana$c). If you remove a.y and c.y at the end, that will be the outcome you expect, I think.
Try:
xa = x[,c(1,2)]
m1 = merge(y,xa,all=T)
m1 = m1[!duplicated(m1$index),]
m1$b = x$b[match(m1$index, x$index)]
m1$c = x$c[match(m1$index, x$index)]
m1
index a b c
1 1 1 5 NA
2 2 100 6 NA
4 3 2 NA 5
5 4 101 9 NA
7 5 NA 10 6
or, if there many other columns like b and c:
xa = x[,c(1,2)]
m1 = merge(y,xa,all=T)
m1 = m1[!duplicated(m1$index),]
for(nn in names(x)[3:4]) m1[,nn] = x[,nn][match(m1$index, x$index)]
m1
index a b c
1 1 1 5 NA
2 2 100 6 NA
4 3 2 NA 5
5 4 101 9 NA
7 5 NA 10 6
If there are multiple columns to replace, you could try converting from wide to long form as shown in the first two methods and replace in one step
m1 <- merge(x,y, by="index", all=TRUE)
m1L <- reshape(m1, idvar="index", varying=grep("\\.", colnames(m1)), direction="long", sep=".")
row.names(m1L) <- 1:nrow(m1L)
lst1 <- split(m1L, m1L$time)
indx <- is.na(lst1[[1]][,4:5])
lst1[[1]][,4:5][indx] <- lst1[[2]][,4:5][indx]
res <- lst1[[1]][,c(4,1,2,5)]
res
# a index b c
#1 1 1 5 NA
#2 100 2 6 4
#3 2 3 NA 5
#4 101 4 9 NA
#5 NA 5 10 6
Or you could use dplyr with tidyr
library(dplyr)
library(tidyr)
z <- left_join(x, y, by="index") %>%
gather(Var, Val, matches("\\.")) %>%
separate(Var, c("Var1", "Var2"))
indx1 <- which(is.na(z$Val) & z$Var2=="x")
z$Val[indx1] <- z$Val[indx1+nrow(z)/2]
z %>%
spread(Var1, Val) %>%
filter(Var2=="x") %>%
select(-Var2)
# index b a c
#1 1 5 1 NA
#2 2 6 100 4
#3 3 NA 2 5
#4 4 9 101 NA
#5 5 10 NA 6
Or split the columns by matching the names before the "." and use sapply to replace the NAs.
indx <- grep("\\.", colnames(m1),value=TRUE)
res <- cbind(m1[!names(m1) %in% indx],
sapply(split(indx, gsub("\\..*", "", indx)), function(x) {
x1 <- m1[x]
indx1 <- is.na(x1[,1])
x1[,1][indx1] <- x1[,2][indx1]
x1[,1]} ))
res
# index b a c
#1 1 5 1 NA
#2 2 6 100 4
#3 3 NA 2 5
#4 4 9 101 NA
#5 5 10 NA 6
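On newer dplyr versions, rows_patch() expresses this task ("fill NAs in x with matching values from y") directly. A minimal sketch, assuming dplyr >= 1.0, that index uniquely identifies rows, and that every index in y also exists in x:
library(dplyr)
# rows_patch() only overwrites values that are NA in x, using the columns
# shared with y (here a and c), matched on the key column
rows_patch(x, y, by = "index")
# index 2: a and c are filled from y; index 4: a is filled, c stays NA
# because y itself has NA there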
