Delete duplicates between groups in R

Thanks in advance for any help.
My data looks like this:
|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|
The goal is to delete any class that occurs in the previous year, so that the final data looks like this:
|year|class|
|---|----|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|g|
I tried the code below. The intent was to group the data and then delete all within-group duplicates, but it only removed a few rows rather than everything.
Instead of duplicated() I also tried unique(), which did not work either.
d %>%
  group_by(class, Group = c(0, cumsum(diff(year) != 1))) %>%
  filter(!(duplicated(class, fromLast = TRUE) | duplicated(class))) %>%
  ungroup() %>%
  select(-Group)
Is there maybe another R function that can look at differences between groups?
Thanks for any help.
Edit: Thanks to all for your very helpful answers!

Left join DF to itself on class and a year difference of 1 and retain only those rows for which there is no such match.
library(sqldf)
sqldf("select a.*
from DF a
left join DF b on b.class = a.class and b.year = a.year - 1
where b.year is null")
giving:
year class
1 2007 a
2 2007 b
3 2007 c
4 2007 d
5 2008 e
6 2008 f
7 2009 c
8 2009 d
9 2009 g
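For comparison, the same anti-join idea can be written with dplyr; this is only a sketch, assuming the DF defined in the Note below:
library(dplyr)
# a row is dropped when shifting DF forward by one year reproduces its (year, class) pair
anti_join(DF, mutate(DF, year = year + 1), by = c("year", "class"))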
Note
Lines <- "|year|class|
|2007|a|
|2007|b|
|2007|c|
|2007|d|
|2008|a|
|2008|b|
|2008|e|
|2008|f|
|2009|c|
|2009|d|
|2009|e|
|2009|g|"
DF <- read.table(text = Lines, sep = "|", header = TRUE)[2:3]

Using data.table: for each year, keep only the classes that do not appear among the previous year's classes.
setDT(df)[, .(class = setdiff(class, df[year==y-1, class])), by=.(y=year)]
# y class
# 1: 2007 a
# 2: 2007 b
# 3: 2007 c
# 4: 2007 d
# 5: 2008 e
# 6: 2008 f
# 7: 2009 c
# 8: 2009 d
# 9: 2009 g

df=df[order(df$class,df$year),]
df$y_diff=c(0,diff(df$year))
df$c_lag=c("x",head(df$class,-1))
df[df$y_diff!=1 | df$class!=df$c_lag,1:2]
year class
1 2007 a
2 2007 b
3 2007 c
9 2009 c
4 2007 d
10 2009 d
7 2008 e
8 2008 f
12 2009 g

Here are some base R solutions:
split + for loop
dflst <- unname(split(df, df$year))
for (k in seq_along(dflst)[-1]) {
  dflst[[k]] <- subset(dflst[[k]], !class %in% dflst[[k - 1]]$class)
}
dfout <- do.call(rbind, dflst)
merge + subset + is.na
dfout <- subset(
  merge(
    df,
    transform(df, yr = year + 1),
    by.x = c("year", "class"),
    by.y = c("yr", "class"),
    all.x = TRUE
  ),
  is.na(year.y),
  select = -year.y
)
which gives
year class
1 2007 a
2 2007 b
3 2007 c
4 2007 d
7 2008 e
8 2008 f
9 2009 c
10 2009 d
12 2009 g
data
> dput(df)
structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))

An analysis of all the current answers
df=structure(list(year = c(2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L), class = c("a", "b",
"c", "d", "a", "b", "e", "f", "c", "d", "e", "g")), class = "data.frame", row.names = c(NA,
-12L))
library(sqldf)
library(data.table)
library(dplyr)
library(purrr)
library(microbenchmark)
groth <- function() {
  sqldf("select a.*
         from df a
         left join df b on b.class = a.class and b.year = a.year - 1
         where b.year is null")
}
thomas1 <- function() {
  dflst <- unname(split(df, df$year))
  for (k in seq_along(dflst)[-1]) {
    dflst[[k]] <- subset(dflst[[k]], !class %in% dflst[[k - 1]]$class)
  }
  dfout <- do.call(rbind, dflst)
}
thomas2 <- function() {
  dfout <- subset(
    merge(
      df,
      transform(df, yr = year + 1),
      by.x = c("year", "class"),
      by.y = c("yr", "class"),
      all.x = TRUE
    ),
    is.na(year.y),
    select = -year.y
  )
}
dww <- function() {
  setDT(df)[, .(class = setdiff(class, df[year == y - 1, class])), by = .(y = year)]
}
user29 <- function() {
  df <- df[order(df$class, df$year), ]
  df$y_diff <- c(0, diff(df$year))
  df$c_lag <- c("x", head(df$class, -1))
  df[df$y_diff != 1 | df$class != df$c_lag, 1:2]
}
anous <- function() {
  df %>%
    group_by(class) %>%
    mutate(dup = n() > 1) %>%
    group_split() %>%
    map_dfr(~ if (unique(.x$dup) & (.x$year[2] - .x$year[1]) == 1) {
      .x %>% slice_head(n = 1)
    } else {
      .x
    }) %>%
    select(-dup) %>%
    arrange(year)
}
benchmark
set.seed(1)
microbenchmark::microbenchmark(
groth(), thomas1(), thomas2(), dww(), user29(), anous(), times=10)
Unit: microseconds
expr min lq mean median uq max neval
groth() 8864.702 9532.502 10885.691 9774.151 11628.401 14432.101 10
thomas1() 792.801 836.001 1666.511 1024.651 1065.601 7921.401 10
thomas2() 1758.700 2024.700 3172.011 2371.601 3348.701 8032.301 10
dww() 3876.201 4280.400 4953.251 4383.701 5320.101 8807.501 10
user29() 464.601 494.502 1249.081 542.951 643.300 7562.401 10
anous() 10506.801 11091.602 12232.101 11424.801 12889.401 17279.201 10
With a much bigger data frame, I had to remove thomas2 because it did not work:
df=data.frame(
"year"=sample(2000:2020,1e5,replace=T),
"class"=sample(LETTERS[1:20],1e5,replace=T)
)
microbenchmark::microbenchmark(
groth(), thomas1(), dww(), user29(), anous(), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
groth() 1217.9176 1270.225702 1290.86323 1305.06580 1322.3443 1341.0451 10
thomas1() 13.6828 14.331401 17.94286 17.76540 21.2913 23.5265 10
dww() 31.3091 36.660201 41.31367 40.27055 44.5629 54.6295 10
user29() 7.8137 9.481402 11.97380 11.31740 14.2235 16.9593 10
anous() 12.7733 13.266902 14.60760 13.50610 15.1067 19.9610 10

General assumptions
Table is ordered by Year
Case one
For each group of records (grouped by Year) remove Class value if it has appeared in previous Year.
Solution
Transform the data so that, for each Year in the table, Class becomes a list of all Class values that appeared during that Year (chop());
For each Year, remove any Class value (setdiff) that appeared during the previous Year (lag(Class));
Transform Class from a list of lists back into an atomic vector (unchop()).
Code
library(tidyverse)
dat %>%
chop(Class) %>%
mutate(Class = map2(Class, lag(Class), setdiff)) %>%
unchop(Class)
Output
# Year Class
#1 2007 a
#2 2007 b
#3 2007 c
#4 2007 d
#5 2008 e
#6 2008 f
#7 2009 c
#8 2009 d
#9 2009 g
Case two
This case is more interesting than the previous one, because to solve it one needs to compare the current list of Class values to all of the Class values that appeared during the previous years.
Solution
Transform the data so that, for each Year in the table, Class becomes a list of all Class values that appeared during that Year (chop());
Create a list of Class values so that each entry contains the unique set of Class values that appeared during that Year and all the Years before (accumulate(Class, union));
For each Year, remove any Class value (setdiff) that appeared during the previous Years (lag(...)), as calculated in step 2;
Transform Class from a list of lists back into an atomic vector (unchop()).
Code
library(tidyverse)
dat %>%
chop(Class) %>%
mutate(Class = map2(Class, lag(accumulate(Class, union)), setdiff)) %>%
unchop(Class)
Output
# Year Class
#1 2007 a
#2 2007 b
#3 2007 c
#4 2007 d
#5 2008 e
#6 2008 f
#7 2009 g
Data
I have changed the names of the variables, capitalizing the first letter. It is against the concept of tidy data, and it bothers me a lot. However, the fact that you use the name class, which is the name of one of R's primitive functions, bothers me even more.
dat <- structure(
list(
Year = c(2007, 2007, 2007, 2007, 2008, 2008, 2009, 2009, 2009),
Class = c("a", "b", "c", "d", "e", "f", "c", "d", "g")
),
class = "data.frame", row.names = c(NA,-9L)
)

You can also use the following tidyverse solution. I would like to thank @ThomasIsCoding for the data:
library(dplyr)
library(purrr)
df %>%
group_by(class) %>%
mutate(dup = n() > 1) %>%
group_split() %>%
map_dfr(~ if(unique(.x$dup) & (.x$year[2] - .x$year[1]) == 1) {
.x %>% slice_head(n = 1)
} else {
.x
}) %>%
select(-dup) %>%
arrange(year)
# A tibble: 9 x 2
year class
<int> <chr>
1 2007 a
2 2007 b
3 2007 c
4 2007 d
5 2008 e
6 2008 f
7 2009 c
8 2009 d
9 2009 g


R - cleaning data with repeated columns for different locations

Edited to make my data more similar to the data I'm working with and to add an example of what I have tried.
I am working with a Qualtrics survey where blocks of questions repeat themselves based on previous questions, using a feature in the survey builder called "loop and merge". I'm trying to pull out like questions and then use rbind so that each question only shows up in one column. I have a basic example below; however, in my actual data the repeats happen 36 times.
example data frame:
capacity_1 <- data.frame("1_q1" = 1:4,
"1_q2" = c("a", "b", "c", "d"),
'1_q3' = 10:13,
'1_q4' = 100:103,
'1_q5' = 110:113,
'1_q6' = 11:14,
"2_q1" = 22:25,
"2_q2" = c("i", "j", "k", "l"),
'2_q3' = 20:23,
'2_q4' = 200:203,
'2_q5' = 210:213,
'2_q6' = 21:24,
"3_q1" = 90:93,
"3_q2" = c("p", "q", "r", "s"),
'3_q3' = 10:13,
'3_q4' = 300:303,
'3_q5' = 310:313,
'3_q6' = 31:34,check.names = FALSE)
Note that the "1_" at the start of "1_q1" is the county's reference number.
Here is what I could do, but it is inefficient, especially since my actual data repeats these questions 36 times:
dat_1 <- dat %>%
select(1:2) %>%
rename(q = 1:2) %>%
mutate("county" = 1)
dat_2 <- dat %>%
select(3:4) %>%
rename(q = 1:2) %>%
mutate("county" = 2)
dat_3 <- dat %>%
select(5:6) %>%
rename(q = 1:2)%>%
mutate("county" = 3)
dat_final <- rbind(dat_1, dat_2, dat_3)
the "dat_final" data frame is what I'd like the data to look like, but also have formatted again here:
dat_clean <- data.frame("q1" = c(1:4, 22:25, 90:93),
"q2" = c("a", "b", "c", "d",
"i", "j", "k", "l",
"p", "q", "r", "s"),
"county" = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3))
Update: I tried the suggestion below and get the error "Error in set_names(): the size of 'nm' (6) must be compatible with the size of 'x' (2)".
do.call(
  rbind,
  lapply(seq(1, ncol(capacity_1), 6), \(i) {
    capacity_1 %>%
      select(c(i, i + 5)) %>%
      rename_all(~ c("capacity_outpatient", "capacity_inpatient", "capacity_housing",
                     "capacity_recovery", "capacity_demand", "capacity_notes")) %>%
      mutate(county = (i + 5) / 6)
  })
)
You can do the following, which uses a seq from 1 to ncol(dat), by 2:
do.call(
rbind,
lapply(seq(1,ncol(dat),2), \(i) {
dat %>% select(c(i,i+1)) %>% rename_all(~c("q1","q2")) %>% mutate(county=(i+1)/2)
})
)
Output:
q1 q2 county
1 1 a 1
2 2 b 1
3 3 c 1
4 4 d 1
5 22 i 2
6 23 j 2
7 24 k 2
8 25 l 2
9 90 p 3
10 91 q 3
11 92 r 3
12 93 s 3
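Regarding the update in the question: the error appears to come from select(c(i, i + 5)), which picks only two columns (the i-th and the (i+5)-th) while rename_all() supplies six names. Selecting the whole run i:(i + 5) should line things up; a sketch of that adaptation, reusing the column names given in the update, might look like this:
do.call(
  rbind,
  lapply(seq(1, ncol(capacity_1), 6), \(i) {
    capacity_1 %>%
      # take all six consecutive columns belonging to one county
      select(i:(i + 5)) %>%
      rename_all(~ c("capacity_outpatient", "capacity_inpatient", "capacity_housing",
                     "capacity_recovery", "capacity_demand", "capacity_notes")) %>%
      mutate(county = (i + 5) / 6)
  })
)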
Another approach, with data.table
library(data.table)
setDT(dat)
rbindlist(lapply(seq(1,ncol(dat),2), \(i) {
setnames(dat[,i:(i+1)],c("q1","q2"))
}), use.names=F,idcol = "county")
Output:
county q1 q2
1: 1 1 a
2: 1 2 b
3: 1 3 c
4: 1 4 d
5: 2 22 i
6: 2 23 j
7: 2 24 k
8: 2 25 l
9: 3 90 p
10: 3 91 q
11: 3 92 r
12: 3 93 s
A solution using dplyr, purrr, and stringr. This solution is not affected by column order or by the number of q columns; it just uses the prefix as the basis for processing the data.
library(dplyr)
library(purrr)
library(stringr)
dat <- data.frame("1_q1" = 1:4,
"1_q2" = c("a", "b", "c", "d"),
"2_q1" = 22:25,
"2_q2" = c("i", "j", "k", "l"),
"3_q1" = 90:93,
"3_q2" = c("p", "q", "r", "s"), check.names = FALSE)
# Indexes of the counties we want to extract from the data frame
county_index <- c("1", "2", "3")
# Function that takes an index as input and extracts that county's data from `dat`
edit_df <- function(index) {
  dat %>%
    # select the columns starting with the index prefix
    select(matches(paste0(index, "_"))) %>%
    # remove the index prefix from the column names
    rename_all(~ str_replace(., regex("^\\d+_", ignore_case = TRUE), "")) %>%
    # add a county column with the input index
    mutate("county" = as.numeric(index))
}
Result using purrr::map_dfr
# map edit_df over the county indexes we want to extract from the original data frame
dat_clean <- map_dfr(.x = county_index, .f = edit_df)
dat_clean
#> q1 q2 county
#> 1 1 a 1
#> 2 2 b 1
#> 3 3 c 1
#> 4 4 d 1
#> 5 22 i 2
#> 6 23 j 2
#> 7 24 k 2
#> 8 25 l 2
#> 9 90 p 3
#> 10 91 q 3
#> 11 92 r 3
#> 12 93 s 3
Created on 2022-05-25 by the reprex package (v2.0.1)

Unnest list of lists of data frames, containing NAs

I have a list of nested data, containing lists with two data frames each looking like this:
mylist <- list(
list(
p = data.frame(
id = "01",
stringsAsFactors = F
),
c = data.frame(
text = c("one", "two"),
from = c("A", "B"),
stringsAsFactors = F
)
),
list(
p = data.frame(
id = "02",
stringsAsFactors = F
),
c = data.frame(
text = c("three", "four", "five"),
from = c("C", "D", "E"),
stringsAsFactors = F
)
),
list(
p = data.frame(
id = "03",
stringsAsFactors = F
),
c = data.frame(
text = logical(0),
from = logical(0)
)
)
)
I want to flatten this list into a data frame, with one row per observation from the "c" data frames, a column that indicates the "id" of the level above (stored in the "p" data frames), and non-observations filled with NAs. The result should look like this:
df <- data.frame(
p.id = c("01", "01", "02", "02", "02", "03"),
c.text = c("one", "two", "three", "four", "five", NA),
c.from = c("A", "B", "C", "D", "E", NA)
)
#
# p.id c.text c.from
# 01 one A
# 01 two B
# 02 three C
# 02 four D
# 02 five E
# 03 <NA> <NA>
as.data.frame() provides a very simple solution that comes very close to the desired result, but breaks when there are 0 observations in the "c" data frame.
mylist[[1]] %>% as.data.frame()
mylist[[3]] %>% as.data.frame()
I am aware that there are similar questions on Stack Overflow, and I have read several threads closely and tried different things, from bind_rows() to jsonlite::flatten(), tidyr::unnest(), and data.table::rbindlist(), but have not been able to make it work.
Help is highly appreciated!
Here is a tidyverse solution:
library(tidyverse)
map(mylist, ~as_tibble(.)) %>%
enframe() %>%
unnest_longer(value)
Which gives us:
# A tibble: 6 x 2
name value$p$id $c$text $$from
<int> <chr> <chr> <chr>
1 1 01 one A
2 1 01 two B
3 2 02 three C
4 2 02 four D
5 2 02 five E
6 3 NA NA NA
I create a helper function to combine p and c:
foo <- function(x) {
  a <- x[[1]]
  b <- x[[2]]
  if (nrow(b) == 0) b[1, ] <- NA
  return(cbind(a, b))
}
Then I run the helper function on each element and bind the rows:
do.call(rbind, lapply(mylist, foo))
The result:
> do.call(rbind, lapply(mylist, foo))
id text from
1 01 one A
2 01 two B
3 02 three C
4 02 four D
5 02 five E
6 03 <NA> <NA>
P.S. The same result using the R base pipe:
lapply(mylist, foo) |> do.call(what = rbind)

geom_bar two datasets together in R

I have two data frames, with two columns each, that I would like to plot together as a bar plot using ggplot in R, as shown below:
How can I do this using dplyr in R?
Sample Data:
DF1
Code Count_2020
A 1
B 2
C 3
D 4
E 5
F 6
DF2
Code Count_2021
A 4
B 8
C 6
D 8
E 10
F 12
So, I first thought of merging the two dataframes into one using dplyr::inner_join, and I got a new dataframe as shown below:
Code Count_2021 Count_2020
A 4 1
B 8 2
C 6 3
D 8 4
E 10 5
F 12 6
Next I thought of using dplyr::gather to plot the count data from both years together as Type and Value, but this messed up the gathered dataframe as the output changed to:
Type Value
Code A
Code B
Code C
Code D
Code E
Code F
Code I tried
library(tidyverse)
# Merge DF1 and DF2
DF = inner_join(DF1, DF2)
# Gather data for plotting
Gathered_DF= DF%>% dplyr::select(Code, Count_2020, Count_2021) %>%
gather(key = Type, value = Value) # Output not as expected, stuck!!
We can reshape to 'long' format with pivot_longer after the join and then use geom_col in ggplot2 with position specified as 'dodge' and fill as 'Year'.
library(dplyr)
library(tidyr)
library(ggplot2)
inner_join(DF1, DF2) %>%
pivot_longer(cols = -Code, names_to = 'Year', names_prefix = 'Count_') %>%
ggplot(aes(x = Code, y = value, fill = Year)) +
geom_col(position = 'dodge') +
theme_bw()
-output
data
DF1 <- structure(list(Code = c("A", "B", "C", "D", "E", "F"),
Count_2020 = 1:6), class = "data.frame", row.names = c(NA,
-6L))
DF2 <- structure(list(Code = c("A", "B", "C", "D", "E", "F"), Count_2021 = c(4L,
8L, 6L, 8L, 10L, 12L)), class = "data.frame", row.names = c(NA,
-6L))
You can use pivot_longer() instead of gather(), as gather() has been superseded as of tidyr 1.1.3.
library(tidyverse)
df1 <- data.frame(Code = c("A", "B", "C", "D", "E", "F"),
Count_2020 = c(1,2,3,4,5,6))
df2 <- data.frame(Code = c("A", "B", "C", "D", "E", "F"),
Count_2021 = c(4, 8, 6, 8, 10, 12))
df_joined <- df1 %>%
inner_join(df2, by = "Code") %>%
pivot_longer(cols = !Code, names_to = "Year", names_prefix = "Count_", values_to = "Count")
df_joined
#> # A tibble: 12 x 3
#> Code Year Count
#> <fct> <chr> <dbl>
#> 1 A 2020 1
#> 2 A 2021 4
#> 3 B 2020 2
#> 4 B 2021 8
#> 5 C 2020 3
#> 6 C 2021 6
#> 7 D 2020 4
#> 8 D 2021 8
#> 9 E 2020 5
#> 10 E 2021 10
#> 11 F 2020 6
#> 12 F 2021 12
ggplot(df_joined, aes(x = Code, y = Count, fill = Year)) +
geom_bar(stat = "identity", position = "dodge")
In the code above, the arguments inside pivot_longer() are:
cols = !Code specifies the columns to be pivoted, that is, all columns except Code
names_to = "Year" gives the name of the column to be created for grouping
names_prefix = "Count_" removes the string "Count_" from the created "Year" column
values_to = "Count" gives the name of the column created to store the value from each group
You can learn more about this function by simply calling ?pivot_longer
Use pivot_longer() to reshape your data then plot using ggplot.
Bonus: to add text on the bars, use geom_bar_text from the ggfittext package
library(tidyverse)
DF1 <- read.table(text = "Code Count_2020
A 1
B 2
C 3
D 4
E 5
F 6", header = TRUE)
DF2 <- read.table(text = "Code Count_2021
A 4
B 8
C 6
D 8
E 10
F 12", header = TRUE)
DF <- left_join(DF1, DF2, by = "Code")
DF_long <- DF %>%
pivot_longer(-Code,
names_to = c("tmp", "Year"),
names_sep = "\\_",
values_to = "Count") %>%
select(-tmp)
DF_long
#> # A tibble: 12 x 3
#> Code Year Count
#> <chr> <chr> <int>
#> 1 A 2020 1
#> 2 A 2021 4
#> 3 B 2020 2
#> 4 B 2021 8
#> 5 C 2020 3
#> 6 C 2021 6
#> 7 D 2020 4
#> 8 D 2021 8
#> 9 E 2020 5
#> 10 E 2021 10
#> 11 F 2020 6
#> 12 F 2021 12
plt <- ggplot(DF_long, aes(x = Code,
y = Count,
fill = Year)) +
geom_col(position = position_dodge(width = 0.9)) +
theme_minimal()
plt
library(ggfittext)
plt +
geom_bar_text(position = "dodge", reflow = TRUE)
Created on 2021-08-05 by the reprex package (v2.0.1)
I also found another way of doing it:
library(tidyverse)
DF1 = data.frame(Code = c("A", "B", "C", "D", "E", "F"),
Count_2020 = c(1,2,3,4,5,6))
DF2 = data.frame(Code = c("A", "B", "C", "D", "E", "F"),
Count_2021 = c(4, 8, 6, 8, 10, 12))
DF_Merged =
inner_join(DF1, DF2)
DFF_Merged = DF_Merged %>% dplyr::select(Code, Count_2020, Count_2021) %>%
gather(key = Type, value = Value, -Code) %>%
mutate(Type = ifelse(Type == "Count_2020", "2020", "2021"))
DFF_Merged %>%
ggplot(aes(x = reorder(Code,Value), y = Value, fill = Type,
text = paste("Count:", Value,
"<br>", "Offense Code:", Code,
"<br>", "Year:", Type))) +
geom_col(position = "dodge", show.legend = FALSE) +
xlab("Offense Code") +
ylab("Count") +
ggtitle("Arrest Counts for Group 1 in Year 2020 and 2021") +
theme(axis.text=element_text(size=8))
Result

replace certain values in df according to several conditions

This is a basic question, but I am looking for a nice solution (no for loops) for conditional replacement in DF1 by values of DF2 if several conditions are fulfilled:
DF1
Name Year Val1
A 2010 x1
A 2012 x2
B 2012 x3
C 2015 x4
C 2012 x5
DF2
Name Year Val1
A 2012 y1
B 2012 y2
C 2012 y3
If Year has a certain value (2012 in this case) and the Name in DF1 and DF2 is the same, then assign Val1 from DF2 to DF1.
I tried several things:
DF1$Val1[DF1$Year=="2012"&DF1$Name==DF2$Name,] <-DF2$Val1
DF1$Val1<-replace(DF1$Val1, DF1$Year=="2012" & DF1$Name==DF2$Name, DF2$Val1)
But I unfortunately get an error because DF1 and DF2 are not of the same length.
Expected:
DF1
Name Year Val1
A 2010 x1
A 2012 y1
B 2012 y2
C 2015 x4
C 2012 y3
THANK YOU FOR YOUR HELP!
We can use a join on the columns with data.table and update 'Val1':
library(data.table)
setDT(DF1)[DF2, Val1 := i.Val1, on = .(Name, Year)]
DF1
# Name Year Val1
#1: A 2010 x1
#2: A 2012 y1
#3: B 2012 y2
#4: C 2015 x4
#5: C 2012 y3
data
DF1 <- structure(list(Name = c("A", "A", "B", "C", "C"), Year = c(2010L,
2012L, 2012L, 2015L, 2012L), Val1 = c("x1", "x2", "x3", "x4",
"x5")), class = "data.frame", row.names = c(NA, -5L))
DF2 <- structure(list(Name = c("A", "B", "C"), Year = c(2012L, 2012L,
2012L), Val1 = c("y1", "y2", "y3")), class = "data.frame", row.names = c(NA,
-3L))
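A related dplyr option (a sketch only, not part of the original answer; it needs dplyr 1.0.0 or later) is rows_update(), which performs the same keyed update:
library(dplyr)
# update Val1 in DF1 with the matching rows of DF2, keyed by Name and Year
rows_update(DF1, DF2, by = c("Name", "Year"))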
I think the easiest way to do this is to filter DF2 down and then append it to DF1.
So:
DF2 <- dplyr::filter(DF2, Year == 2012,
                     Name %in% unique(DF1$Name))
DF1 <- dplyr::bind_rows(DF1, DF2)
Here are two base R solutions.
- Using match:
# match the (Name, Year) pairs of DF2 to those of DF1 by comparing their transposed columns
inds <- match(data.frame(t(DF2[-3]), stringsAsFactors = FALSE),
              data.frame(t(DF1[-3]), stringsAsFactors = FALSE))
DF1$Val1[inds] <- DF2$Val1
such that
> DF1
Name Year Val1
1 A 2010 x1
2 A 2012 y1
3 B 2012 y2
4 C 2015 x4
5 C 2012 y3
- Using merge + subset:
DF1 <- subset(within(merge(DF1,DF2,by=c("Name","Year"),all.x = TRUE),
Val1 <- ifelse(is.na(Val1.y),Val1.x,Val1.y)),
select = names(DF1))
such that
> DF1
Name Year Val1
1 A 2010 x1
2 A 2012 y1
3 B 2012 y2
4 C 2012 y3
5 C 2015 x4
We can left_join DF1 and DF2 on Name and Year and use coalesce to select the non-NA values from the two Val1 columns.
library(dplyr)
DF1 %>%
left_join(DF2, by = c('Name', 'Year')) %>%
mutate(Val1 = coalesce(Val1.y, Val1.x)) %>%
select(names(DF1))
# Name Year Val1
#1 A 2010 x1
#2 A 2012 y1
#3 B 2012 y2
#4 C 2015 x4
#5 C 2012 y3

Count number of occurrences for every column in dataframe

I have a data frame with an unknown number of columns (it can change frequently), and I need to count the number of observations for a given ID and year for every column, creating a custom "n" column for each column of my data frame telling me how many observations were made for that specific column.
I have tried:
library(dplyr)
count <- tally(group_by(final_database,ID,Year))
But that counts unique combinations of ID + Year, while I need to know how many times over the years each ID was observed for each characteristic. Example:
ID Year CHAR1 n_CHAR1
A 2016 0 3
A 2017 5 3
A 2018 2 3
A 2019 3
B 2016 1 2
B 2017 2
B 2018 2
B 2019 1 2
And so on for all characteristics. I would insert the "n_CHAR" columns to the original dataframe.
It doesn't need to be tidy.
Thanks!
Try:
transform(final_database, n_CHAR1 = ave(CHAR1, ID, FUN = function(x) sum(x != "")))
If the blank rows are actually NA, then just replace sum(x != "") with sum(!is.na(x)).
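For example, assuming the blanks really are NA, that call would look like this:
transform(final_database, n_CHAR1 = ave(CHAR1, ID, FUN = function(x) sum(!is.na(x))))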
Edit:
If you'd need multiple n columns for multiple NCHAR columns, you could do:
library(dplyr)
final_database %>%
group_by(ID) %>%
mutate_at(vars(starts_with("CHAR")),
list(n = ~ sum(. != "")))
This example assumes that all the relevant CHAR columns start with the string CHAR (e.g. CHAR1, CHAR2, CHAR3, etc.).
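As a side note (not part of the original answer): mutate_at() is superseded in current dplyr, so a roughly equivalent sketch with across(), again assuming the relevant columns start with "CHAR", would be:
final_database %>%
  group_by(ID) %>%
  mutate(across(starts_with("CHAR"), ~ sum(. != ""), .names = "n_{.col}")) %>%
  ungroup()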
If the columns you're referring to are 3rd to last, then you can do:
library(dplyr)
finalDatabase <- final_database %>%
group_by(ID) %>%
mutate_at(vars(3:ncol(.)), # if you don't have many other vars besides the CHAR columns, you can also do vars(-ID, -Year) as suggested by @camille
list(n = ~ sum(. != ""))) %>%
select(ID, Year, ends_with("_n"))
We can also do this with data.table:
library(data.table)
setDT(df)[, n_CHAR1 := sum(CHAR1 != ""), by = "ID"]
Output:
ID Year CHAR1 n_CHAR1
1: A 2016 0 3
2: A 2017 5 3
3: A 2018 2 3
4: A 2019 3
5: B 2016 1 2
6: B 2017 2
7: B 2018 2
8: B 2019 1 2
Data:
df <- structure(list(ID = c("A", "A", "A", "A", "B", "B", "B", "B"),
Year = c(2016L, 2017L, 2018L, 2019L, 2016L, 2017L, 2018L,
2019L), CHAR1 = c("0", "5", "2", "", "1", "", "", "1")), row.names = c(NA,
-8L), class = "data.frame")
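A hedged extension (an assumption, not in the original answer): the same count can be done for every CHAR column at once, assuming all relevant columns start with "CHAR":
library(data.table)
char_cols <- grep("^CHAR", names(df), value = TRUE)
# one n_ column per CHAR column, counting non-blank entries within each ID
setDT(df)[, paste0("n_", char_cols) := lapply(.SD, function(x) sum(x != "")),
          by = ID, .SDcols = char_cols]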
