replace certain values in df according to several conditions - r

basic question but I am looking for a nice solution (not for loops) for conditional replacement in DF1 by values of DF2 IF several conditions are fulfilled:
DF1
Name Year Val1
A 2010 x1
A 2012 x2
B 2012 x3
C 2015 x4
C 2012 x5
DF2
Name Year Val1
A 2012 y1
B 2012 y2
C 2012 y3
If Year is of a certain value such as 2012 in this case and the Name of DF1 and DF2 are the same then assign Val1 from DF2 to DF1.
I tried several things:
DF1$Val1[DF1$Year=="2012"&DF1$Name==DF2$Name,] <-DF2$Val1
DF1$Val1<-replace(DF1$Val1, DF1$Year=="2012" & DF1$Name==DF2$Name, DF2$Val1)
But I unfortunately get an error because DF1 and DF2 are not of the same length.
Expected:
DF1
Name Year Val1
A 2010 x1
A 2012 y1
B 2012 y2
C 2015 x4
C 2012 y3
THANK YOU FOR YOUR HELP!

We can use a join on the key columns (Name, Year) with data.table and update 'Val1' in place
library(data.table)
# Update join: for each row of DF2, locate the rows of DF1 with the same
# Name and Year and overwrite their Val1 with DF2's value. Inside the
# brackets, the `i.` prefix refers to columns of the joined-in table (DF2).
# DF1 rows without a match in DF2 are left untouched.
setDT(DF1)[DF2, Val1 := i.Val1, on = .(Name, Year)]
DF1
# Name Year Val1
#1: A 2010 x1
#2: A 2012 y1
#3: B 2012 y2
#4: C 2015 x4
#5: C 2012 y3
data
DF1 <- structure(list(Name = c("A", "A", "B", "C", "C"), Year = c(2010L,
2012L, 2012L, 2015L, 2012L), Val1 = c("x1", "x2", "x3", "x4",
"x5")), class = "data.frame", row.names = c(NA, -5L))
DF2 <- structure(list(Name = c("A", "B", "C"), Year = c(2012L, 2012L,
2012L), Val1 = c("y1", "y2", "y3")), class = "data.frame", row.names = c(NA,
-3L))

I think the easiest way to do this is to filter DF2 down and then append it to DF1.
So
# Keep only the 2012 rows of DF2 whose Name also occurs in DF1.
DF2 <- dplyr::filter(
  DF2,
  Year == 2012,
  Name %in% unique(DF1$Name)
)
# Append the filtered rows to DF1.
# NOTE(review): bind_rows() only adds rows; the 2012 rows already in DF1 are
# still present afterwards, so they are not replaced by this step alone.
DF1 <- dplyr::bind_rows(DF1, DF2)

Here are two base R solutions.
- Using match:
inds <- match(data.frame(t(DF2[-3]),stringsAsFactors = FALSE),
data.frame(t(DF1[-3]),stringsAsFactors = FALSE))
DF1$Val1[inds] <- DF2$Val1
such that
> DF1
Name Year Val1
1 A 2010 x1
2 A 2012 y1
3 B 2012 y2
4 C 2015 x4
5 C 2012 y3
- Using merge + subset:
DF1 <- subset(within(merge(DF1,DF2,by=c("Name","Year"),all.x = TRUE),
Val1 <- ifelse(is.na(Val1.y),Val1.x,Val1.y)),
select = names(DF1))
such that
> DF1
Name Year Val1
1 A 2010 x1
2 A 2012 y1
3 B 2012 y2
4 C 2012 y3
5 C 2015 x4

We can left_join DF1 and DF2 on Name and Year and use coalesce to select the first non-NA value from the two Val1 columns (preferring the value from DF2).
library(dplyr)
# Join DF2 onto DF1 by Name and Year; the clashing Val1 columns come back as
# Val1.x (from DF1) and Val1.y (from DF2).
DF1 %>%
  left_join(DF2, by = c('Name', 'Year')) %>%
  # Take DF2's value where the join found one, otherwise keep DF1's value.
  mutate(Val1 = coalesce(Val1.y, Val1.x)) %>%
  # Restore the original column set; R is case-sensitive, so this must be DF1.
  select(names(DF1))
# Name Year Val1
#1 A 2010 x1
#2 A 2012 y1
#3 B 2012 y2
#4 C 2015 x4
#5 C 2012 y3

Related

Delete duplicates between groups in R

Thanks in advance for any help.
my Data looks like this:
|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|
The goal is to delete any class that already occurred in the previous year, so that the final data looks like this:
|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|g|
I tried this code. I intended to group the data and then delete all within-group duplicates, but it did not remove everything, just a few rows.
Instead of duplicated() I also tried unique(), which did not work either.
d %>% group_by(class, Group = c(0, cumsum(diff(year) != 1))) %>%
filter(!(duplicated(class, fromLast = TRUE)| duplicated(class))) %>%
ungroup() %>%
select(-Group)
Is there maybe another R function which can look at group differences?
Thanks for any help
Edit: Thanks to all for your very helpful answers!
Left join DF to itself on class and a year difference of 1 and retain only those rows for which there is no such match.
library(sqldf)
sqldf("select a.*
from DF a
left join DF b on b.class = a.class and b.year = a.year - 1
where b.year is null")
giving:
year class
1 2007 a
2 2007 b
3 2007 c
4 2007 d
5 2008 e
6 2008 f
7 2009 c
8 2009 d
9 2009 g
Note
Lines <- "|year|class|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|"
DF <- read.table(text = Lines, sep = "|", header = TRUE)[2:3]
using library(data.table)
setDT(df)[, .(class = setdiff(class, df[year==y-1, class])), by=.(y=year)]
# y class
# 1: 2007 a
# 2: 2007 b
# 3: 2007 c
# 4: 2007 d
# 5: 2008 e
# 6: 2008 f
# 7: 2009 c
# 8: 2009 d
# 9: 2009 g
df=df[order(df$class,df$year),]
df$y_diff=c(0,diff(df$year))
df$c_lag=c("x",head(df$class,-1))
df[df$y_diff!=1 | df$class!=df$c_lag,1:2]
year class
1 2007 a
2 2007 b
3 2007 c
9 2009 c
4 2007 d
10 2009 d
7 2008 e
8 2008 f
12 2009 g
Here are some base R solutions:
split + for loop
# Split the data by year. `by_year` keeps the untouched rows and is named by
# year, so the preceding calendar year can be looked up directly.
by_year <- split(df, df$year)
dflst <- unname(by_year)
for (k in seq_along(dflst)) {
  # Compare against the ORIGINAL rows of year - 1, not the already-filtered
  # previous list entry. Otherwise a class present in three consecutive years
  # reappears in the third, and a gap in the years is treated as adjacent.
  # A missing year yields NULL, and `%in% NULL` is FALSE for every row.
  prev <- by_year[[as.character(dflst[[k]]$year[1] - 1)]]
  dflst[[k]] <- subset(dflst[[k]], !class %in% prev$class)
}
dfout <- do.call(rbind, dflst)
merge + subset + is.na
dfout <- subset(merge(
df,
transform(
df,
yr = year + 1
),
by.x = c("year", "class"),
by.y = c("yr", "class"),
all.x = TRUE
),
is.na(year.y),
select = -year.y
)
which gives
year class
1 2007 a
2 2007 b
3 2007 c
4 2007 d
7 2008 e
8 2008 f
9 2009 c
10 2009 d
12 2009 g
data
> dput(df)
structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))
A benchmark of all the current answers
df=structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))
library(sqldf)
library(data.table)
library(dplyr)
library(purrr)
library(microbenchmark)
# sqldf approach: left-join df to itself on class and a one-year offset, and
# keep the rows that found no partner.
groth <- function() {
  # Query text is kept verbatim, including its line breaks.
  query <- "select a.*
from df a
left join df b on b.class = a.class and b.year = a.year - 1
where b.year is null"
  sqldf(query)
}
# Base R approach (split + for loop): for every year, drop the classes that
# occur in the preceding calendar year.
# Reads the data frame `df` (columns `year`, `class`) from the enclosing
# environment and returns the filtered rows as one data frame.
thomas1 <- function() {
  # `by_year` keeps the untouched rows and is named by year, so the preceding
  # year can be looked up by name.
  by_year <- split(df, df$year)
  dflst <- unname(by_year)
  for (k in seq_along(dflst)) {
    # Compare against the ORIGINAL rows of year - 1, not the already-filtered
    # previous list entry. Otherwise a class present in three consecutive
    # years reappears in the third, and a gap in the years is treated as
    # adjacent. A missing year yields NULL, and `%in% NULL` is all FALSE.
    prev <- by_year[[as.character(dflst[[k]]$year[1] - 1)]]
    dflst[[k]] <- subset(dflst[[k]], !class %in% prev$class)
  }
  do.call(rbind, dflst)
}
# Base R approach (merge + subset): match every row of df against a copy whose
# year has been shifted forward by one, then keep only the unmatched rows.
thomas2 <- function() {
  # Copy of df with an extra key column `yr` holding year + 1.
  shifted <- transform(df, yr = year + 1)
  # Left join on (year, class) == (yr, class); the copy's own `year` column
  # comes back as `year.y` and is NA where no previous-year row exists.
  joined <- merge(
    df,
    shifted,
    by.x = c("year", "class"),
    by.y = c("yr", "class"),
    all.x = TRUE
  )
  dfout <- subset(joined, is.na(year.y), select = -year.y)
}
# data.table approach: for each year, keep the classes that do not occur among
# the rows of the preceding year (year == y - 1).
dww = function() {
  # Grouping by year under the alias `y` lets the inner lookup
  # df[year == y - 1, class] use the current group's year as `y` while `year`
  # still names the column of df.
  setDT(df)[, .(class = setdiff(class, df[year==y-1, class])), by=.(y=year)]
}
# Base R approach: order by class then year, and drop every row that directly
# follows a row of the same class from exactly one year earlier.
user29 <- function() {
  sorted <- df[order(df$class, df$year), ]
  # Year gap to the preceding row (0 for the first row).
  sorted$y_diff <- c(0, diff(sorted$year))
  # Class of the preceding row ("x" is the placeholder for the first row).
  sorted$c_lag <- c("x", head(sorted$class, -1))
  # A row is a repeat when it has the same class as the row above, one year on.
  is_repeat <- sorted$y_diff == 1 & sorted$class == sorted$c_lag
  sorted[!is_repeat, 1:2]
}
# tidyverse approach: split df by class and, for a class whose first two rows
# are one year apart, keep only the first row.
anous <- function() {
  df %>%
    group_by(class) %>%
    # dup is TRUE for every row of a class that occurs more than once.
    mutate(dup = n() > 1) %>%
    group_split() %>%
    # The `if` condition is a single TRUE/FALSE per group, so use the scalar,
    # short-circuiting `&&` instead of the elementwise `&`. For single-row
    # groups dup is FALSE and the (NA) year difference is never evaluated.
    # NOTE(review): only the first two rows of each class are compared.
    map_dfr(~ if (unique(.x$dup) && (.x$year[2] - .x$year[1]) == 1) {
      .x %>% slice_head(n = 1)
    } else {
      .x
    }) %>%
    select(-dup) %>%
    arrange(year)
}
benchmark
set.seed(1)
microbenchmark::microbenchmark(
groth(), thomas1(), thomas2(), dww(), user29(), anous(), times=10)
Unit: microseconds
expr min lq mean median uq max neval
groth() 8864.702 9532.502 10885.691 9774.151 11628.401 14432.101 10
thomas1() 792.801 836.001 1666.511 1024.651 1065.601 7921.401 10
thomas2() 1758.700 2024.700 3172.011 2371.601 3348.701 8032.301 10
dww() 3876.201 4280.400 4953.251 4383.701 5320.101 8807.501 10
user29() 464.601 494.502 1249.081 542.951 643.300 7562.401 10
anous() 10506.801 11091.602 12232.101 11424.801 12889.401 17279.201 10
with a much bigger dataframe, I had to remove thomas2 because it did not work
# Larger benchmark input: 1e5 rows of randomly drawn years (2000-2020) and
# class labels (the first 20 upper-case letters).
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
df <- data.frame(
  year = sample(2000:2020, 1e5, replace = TRUE),
  class = sample(LETTERS[1:20], 1e5, replace = TRUE)
)
microbenchmark::microbenchmark(
groth(), thomas1(), dww(), user29(), anous(), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
groth() 1217.9176 1270.225702 1290.86323 1305.06580 1322.3443 1341.0451 10
thomas1() 13.6828 14.331401 17.94286 17.76540 21.2913 23.5265 10
dww() 31.3091 36.660201 41.31367 40.27055 44.5629 54.6295 10
user29() 7.8137 9.481402 11.97380 11.31740 14.2235 16.9593 10
anous() 12.7733 13.266902 14.60760 13.50610 15.1067 19.9610 10
General assumptions
Table is ordered by Year
Case one
For each group of records (grouped by Year) remove Class value if it has appeared in previous Year.
Solution
Transform the data, so for each Year in table, Class becomes a list of all Class values appeared during particular Year (chop());
For each particular Year remove Class value (setdiff) if it has appeared during previous Year (lag(Class));
Transform the Class from list of lists to atomic vector.
Code
library(tidyverse)
dat %>%
chop(Class) %>%
mutate(Class = map2(Class, lag(Class), setdiff)) %>%
unchop(Class)
Output
# Year Class
#1 2007 a
#2 2007 b
#3 2007 c
#4 2007 d
#5 2008 e
#6 2008 f
#7 2009 c
#8 2009 d
#9 2009 g
Case two
This case is more interesting than the previous one, because in order to solve it, one needs to compare the current list of Class values to all of the Class values that appeared during any of the previous years (sic!).
Solution
Transform the data, so for each Year in table, Class becomes a list of all Class values appeared during particular Year (chop());
Create list of Class values so each entry of the list contain the unique set of Class values appeared during particular Year and all the Years before (accumulate(Class, union));
For each particular Year remove Class value (setdiff) if it has appeared during previous Years (lag(...)) as it has been calculated at step 2.
Transform the Class from list of lists to atomic vector.
Code
library(tidyverse)
dat %>%
chop(Class) %>%
mutate(Class = map2(Class, lag(accumulate(Class, union)), setdiff)) %>%
unchop(Class)
Output
# Year Class
#1 2007 a
#2 2007 b
#3 2007 c
#4 2007 d
#5 2008 e
#6 2008 f
#7 2009 g
Data
I have changed the names of the variables, capitalizing the first letter. It is against the concept of tidy data, and it bothers me a lot. However, the fact that you use the name class, which is the name of one of R's primitive functions, bothers me even more.
dat <- structure(
list(
Year = c(2007, 2007, 2007, 2007, 2008, 2008, 2009, 2009, 2009),
Class = c("a", "b", "c", "d", "e", "f", "c", "d", "g")
),
class = "data.frame", row.names = c(NA,-9L)
)
You can also use the following tidyverse solution. I would like to thank @ThomasIsCoding for the data:
library(dplyr)
library(purrr)
# Split df by class and, for a class whose first two rows are one year apart,
# keep only the first row; finally order the result by year.
df %>%
  group_by(class) %>%
  # dup is TRUE for every row of a class that occurs more than once.
  mutate(dup = n() > 1) %>%
  group_split() %>%
  # The `if` condition is a single TRUE/FALSE per group, so use the scalar,
  # short-circuiting `&&` instead of the elementwise `&`. For single-row
  # groups dup is FALSE and the (NA) year difference is never evaluated.
  # NOTE(review): only the first two rows of each class are compared.
  map_dfr(~ if (unique(.x$dup) && (.x$year[2] - .x$year[1]) == 1) {
    .x %>% slice_head(n = 1)
  } else {
    .x
  }) %>%
  select(-dup) %>%
  arrange(year)
# A tibble: 9 x 2
year class
<int> <chr>
1 2007 a
2 2007 b
3 2007 c
4 2007 d
5 2008 e
6 2008 f
7 2009 c
8 2009 d
9 2009 g

How fill a dataframe from another one in R?

I want to fill df2 with information from df1.
df1 as below
ID Mutation
1 A
2 B
2 C
3 A
df2 as below
ID A B C
1
2
3
For example, if mutation A is found in ID 1, then I want it in df2 it marked as "Y".
So the df2 result should be
ID A B C
1 Y
2 Y Y
3 Y
I have hundreds of IDs and more than 20 mutations. How can I efficiently achieve this in R? Thanks!
Using data.table you can try
setDT(df)
df2 <- dcast(df,formula = ID~Mutation )
df2[, c("A", "B", "C") := lapply(.SD, function(x) ifelse(is.na(x), " ", "Y")), ID]
df2
#Output
ID A B C
1: 1 Y
2: 2 Y Y
3: 3 Y
Create a new column with value 'Y' and cast the data in wide format.
library(dplyr)
library(tidyr)
df %>%
mutate(value = 'Y') %>%
pivot_wider(names_from = Mutation, values_from = value, values_fill = '')
# ID A B C
# <int> <chr> <chr> <chr>
#1 1 "Y" "" ""
#2 2 "" "Y" "Y"
#3 3 "Y" "" ""
data
df <- structure(list(ID = c(1L, 2L, 2L, 3L), Mutation = c("A", "B",
"C", "A")), class = "data.frame", row.names = c(NA, -4L))

Merge two data frames by row and column names and by group

I have two data frames, df1 and df2, that look as follows:
df1<- data.frame(year, week, X1, X2)
df1
year week X1 X2
1 2010 1 2 3
2 2010 2 8 6
3 2011 1 7 5
firm<-c("X1", "X1", "X2")
year <- c(2010,2010,2011)
week<- c(1, 2, 1)
cost<-c(10,30,20)
df2<- data.frame(firm,year, week, cost)
df2
firm year week cost
1 X1 2010 1 10
2 X1 2010 2 30
3 X2 2011 1 20
I'd like to merge these so the final result (i.e. df3) looks as follows:
df3
firm year week cost Y
1 X1 2010 1 10 2
2 X1 2010 2 30 8
3 X2 2011 1 20 5
Where "Y" is a new variable that reflects the values of X1 and X2 for a particular year and week found in df1.
Is there a way to do this in R? Thank you in advance for your reply.
We can reshape the first dataset to 'long' format and then do a join with the second data
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = X1:X2, values_to = 'Y', names_to = 'firm') %>%
right_join(df2)
-output
# A tibble: 3 x 5
# year week firm Y cost
# <dbl> <dbl> <chr> <int> <dbl>
#1 2010 1 X1 2 10
#2 2010 2 X1 8 30
#3 2011 1 X2 5 20
data
df1 <- structure(list(year = c(2010L, 2010L, 2011L), week = c(1L, 2L,
1L), X1 = c(2L, 8L, 7L), X2 = c(3L, 6L, 5L)), class = "data.frame",
row.names = c("1",
"2", "3"))
df2 <- structure(list(firm = c("X1", "X1", "X2"), year = c(2010, 2010,
2011), week = c(1, 2, 1), cost = c(10, 30, 20)), class = "data.frame",
row.names = c(NA,
-3L))
Here is a base R option (borrowing the data from @akrun, thanks!)
q <- startsWith(names(df1),"X")
v <- cbind(df1[!q],stack(df1[q]),row.names = NULL)
df3 <- merge(setNames(v,c(names(df1)[!q],"Y","firm")),df2)
which gives
> df3
year week firm Y cost
1 2010 1 X1 2 10
2 2010 2 X1 8 30
3 2011 1 X2 5 20

gather on first two rows

I have some poorly formatted data that I must work with. It contains two identifiers in the first two rows, followed by the data. The data looks like:
V1 V2 V3
1 Date 12/16/18 12/17/18
2 Equip a b
3 x1 1 2
4 x2 3 4
5 x3 5 6
I want to gather the data to make it tidy, but gathering only works when you have single column names. I've tried looking at spreading as well. The only solutions I've come up with are very hacky and don't feel right. Is there an elegant way to deal with this?
Here's what I want:
Date Equip metric value
1 12/16/18 a x1 1
2 12/16/18 a x2 3
3 12/16/18 a x3 5
4 12/17/18 b x1 2
5 12/17/18 b x2 4
6 12/17/18 b x3 6
This approach gets me close, but I don't know how to deal with the poor formatting (no header, no row names). It should be easy to gather if the formatting was proper.
> as.data.frame(t(df))
V1 V2 V3 V4 V5
V1 Date Equip x1 x2 x3
V2 12/16/18 a 1 3 5
V3 12/17/18 b 2 4 6
And here's the dput
structure(list(V1 = c("Date", "Equip", "x1", "x2", "x3"), V2 = c("12/16/18",
"a", "1", "3", "5"), V3 = c("12/17/18", "b", "2", "4", "6")), class = "data.frame", .Names = c("V1",
"V2", "V3"), row.names = c(NA, -5L))
Thanks for posting a nicely reproducible question. Here's some gentle tidyr/dplyr massaging.
library(tidyr)
df %>%
gather(key = measure, value = value, -V1) %>%
spread(key = V1, value = value) %>%
dplyr::select(-measure) %>%
gather(key = metric, value = value, x1:x3) %>%
dplyr::arrange(Date, Equip, metric)
#> Date Equip metric value
#> 1 12/16/18 a x1 1
#> 2 12/16/18 a x2 3
#> 3 12/16/18 a x3 5
#> 4 12/17/18 b x1 2
#> 5 12/17/18 b x2 4
#> 6 12/17/18 b x3 6
Updated for tidyr v1.0.0:
This is just a little bit cleaner syntax with the pivot functions.
df %>%
pivot_longer(cols = -V1) %>%
pivot_wider(names_from = V1) %>%
pivot_longer(cols = matches("x\\d"), names_to = "metric") %>%
dplyr::select(-name)
You can use reshape
library(reshape)
row.names(df) = df$V1
df$V1 = NULL
df = melt(data.frame(t(df)),id.var = c('Date','Equip'))
df[order(df$Date),]
Date Equip variable value
1 12/16/18 a x1 1
3 12/16/18 a x2 3
5 12/16/18 a x3 5
2 12/17/18 b x1 2
4 12/17/18 b x2 4
6 12/17/18 b x3 6
Here's another way starting from your approach using t(). We can replace the headers from the first row and then drop the first row, allowing just a single gather which might be more intuitive.
library(tidyverse)
df <- structure(list(V1 = c("Date", "Equip", "x1", "x2", "x3"), V2 = c(
"12/16/18",
"a", "1", "3", "5"
), V3 = c("12/17/18", "b", "2", "4", "6")), class = "data.frame", .Names = c(
"V1",
"V2", "V3"
), row.names = c(NA, -5L))
df %>%
t() %>%
`colnames<-`(.[1, ]) %>%
`[`(-1, ) %>%
as_tibble() %>%
gather("metric", "value", x1:x3) %>%
arrange(Date, Equip, metric)
#> # A tibble: 6 x 4
#> Date Equip metric value
#> <chr> <chr> <chr> <chr>
#> 1 12/16/18 a x1 1
#> 2 12/16/18 a x2 3
#> 3 12/16/18 a x3 5
#> 4 12/17/18 b x1 2
#> 5 12/17/18 b x2 4
#> 6 12/17/18 b x3 6
Created on 2018-04-20 by the reprex package (v0.2.0).

Two by two matching between dataframes in r

I need to combine two data frames (df1 and df2) by matching up two site columns of each data frame to produce a third data frame (df3).
df1 = data.frame(Site.1=c("A","A","B"),
Site.2=c("B","C","C"),
Score1=c(60,70,80))
df1
Site.1 Site.2 Score1
1 A B 60
2 A C 70
3 B C 80
df2 = data.frame(Site.1=c("B","A","A"),
Site.2=c("C","B","C"),
Score2=c(10,20,30))
df2
Site.1 Site.2 Score2
1 B C 10
2 A B 20
3 A C 30
df3 = data.frame(Site.1=c("A","A","B"),
Site.2=c("B","C","C"),
Score1=c(60,70,80),
Score2=c(20,30,10))
df3
Site.1 Site.2 Score1 Score2
1 A B 60 20
2 A C 70 30
3 B C 80 10
You want the merge function. Since your column names that you want to match on already have the same name you don't even need to do anything special. If that wasn't the case you would want to look into the by.x and by.y parameters that merge takes.
df1 = data.frame(Site.1=c("A","A","B"),Site.2=c("B","C","C"),Score1=c(60,70,80))
df2 = data.frame(Site.1=c("B","A","A"),Site.2=c("C","B","C"), Score2=c(10,20,30))
df3 = data.frame(Site.1=c("A","A","B"),Site.2=c("B","C","C"), Score1=c(60,70,80),Score2=c(20,30,10))
df3
# Merge gives you what you want
merge(df1, df2)
dplyr may be helpful here.
library(dplyr)
df1 = data.frame(Site.1 = c("A", "A", "B"),
Site.2 = c("B", "C", "C"),
Score1 = c(60, 70, 80))
df2 = data.frame(Site.1 = c("B", "A", "A"),
Site.2 = c("C", "B", "C"),
Score2 = c(10, 20, 30))
inner_join(df1, df2)

Resources