I'm trying to count how many times each date in one data frame (df1) appears in a second data frame (df2).
df1 <- data.frame(c('1991-01-09', '1991-01-11', '1991-02-17'))
df2 <- data.frame(c('1991-01-09', '1991-01-09', '1991-02-17'))
The result would be the following:
Date Freq
1991-01-09 2
1991-01-11 0
1991-02-17 1
# Compare every date in df1 with every date in df2; each row total is the
# number of df2 entries equal to that df1 date.
df1$count <- rowSums(
  outer(df1$d, df2$d, FUN = function(a, b) a == b)
)
df1
# d count
# 1 1991-01-09 2
# 2 1991-01-11 0
# 3 1991-02-17 1
Data
# Example data: each frame holds one character column `d` of ISO dates.
df1 <- data.frame(
  d = c("1991-01-09", "1991-01-11", "1991-02-17"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  d = c("1991-01-09", "1991-01-09", "1991-02-17"),
  stringsAsFactors = FALSE
)
Using sapply :
# One count per value of df1$col1: the number of entries in df2$col2 equal to
# it. vapply() fixes the result to exactly one integer per value; the names it
# keeps from the character input become stack()'s `ind` column.
# NOTE(review): assumes the date columns are named col1/col2; the sample data
# in this thread names them `d` -- adjust to the actual column names.
stack(vapply(df1$col1, function(x) sum(df2$col2 == x), integer(1)))
# values ind
#1 2 1991-01-09
#2 0 1991-01-11
#3 1 1991-02-17
or you could use purrr::map_dbl()
# Pair every date in df1's first column with the number of equal entries in
# df2's first column.
data.frame(
  Date = df1[, 1],
  Freq = purrr::map_dbl(df1[, 1], function(date) sum(date == df2[, 1]))
)
Date Freq
1 1991-01-09 2
2 1991-01-11 0
3 1991-02-17 1
Related
I have a dataframe with count information (df1)
rownames
sample1
sample2
sample3
m1
0
5
1
m2
1
7
5
m3
6
2
0
m4
3
1
0
and a second with sample information (df2)
rownames
batch
total count
sample1
a
10
sample2
b
15
sample3
a
6
I also have two lists with information about the m values (could easily be turned into another data frame if necessary but I would rather not add to the count information as it is quite large). No patterns (such as even and odd) exist, I am just using a very simplistic example
x <- c("m1", "m3") and y <- c("m2", "m4")
What I would like to do is add another two columns to the sample information: for each sample, the number of m values that are 5 or above and appear in list x (column x) or in list y (column y).
rownames
batch
total count
x
y
sample1
a
10
1
0
sample2
b
15
1
1
sample3
a
6
0
1
My current strategy is to make a list of values for both x and y and then append them to df2. Here are my attempts so far:
numX <- colSums(df1[sum(rownames(df1)>10 %in% x),]) and numX <- colSums(df1[sum(rownames(df1)>10 %in% x),]) both return a list of 0s
numX <- colSums(df1[rownames(df1)>10 %in% x,]) returns a list of the sum of count values meeting the conditions for each column
numX <- length(df1[rownames(df1)>10 %in% novel,]) returns the number of times the condition is met (in this example 2L)
I am not really sure how to approach this so I have just been throwing around attempts. I've tried looking for answers but maybe I am just struggling to find the proper wording.
We may do this with rowwise
library(dplyr)

# For every sample (one row of df2), count how many of the features listed in
# `x` (resp. `y`) have a value of at least 5 in df1's column for that sample.
# The comparison is applied to each value *before* summing, so the result is a
# count of qualifying features rather than a flag on the summed counts.
# `.env$` forces the lookup of the id vectors in the calling environment, so a
# df2 that already carries x/y columns cannot shadow them.
df2 %>%
  rowwise() %>%
  mutate(
    x = sum(df1[[rownames]][df1$rownames %in% .env$x] >= 5),
    y = sum(df1[[rownames]][df1$rownames %in% .env$y] >= 5)
  ) %>%
  ungroup()
-output
# A tibble: 3 × 5
rownames batch totalcount x y
<chr> <chr> <int> <int> <int>
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
Or based on the data, a base R option would be
# Flag every value of at least 5 first (all columns of df1 except the first,
# which holds the feature ids), then count the flagged features belonging to
# each list, per sample column.
above <- as.matrix(df1[-1]) >= 5
out <- do.call(cbind, lapply(
  list(x = x, y = y),
  function(ids) colSums(above[df1$rownames %in% ids, , drop = FALSE])
))
# `out` has one row per sample column of df1 and one column per list. Rows are
# matched to df2 by sample name, so a different ordering cannot misalign them
# and a sample missing from df1 fails loudly.
df2[colnames(out)] <- out[df2$rownames, , drop = FALSE]
-output
> df2
rownames batch totalcount x y
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
data
df1 <- structure(list(rownames = c("m1", "m2", "m3", "m4"), sample1 = c(0L,
1L, 6L, 3L), sample2 = c(5L, 7L, 2L, 1L), sample3 = c(1L, 5L,
0L, 0L)), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(rownames = c("sample1", "sample2", "sample3"),
batch = c("a", "b", "a"), totalcount = c(10L, 15L, 6L)),
class = "data.frame", row.names = c(NA,
-3L))
How about using dplyr and reshape2::melt
# Reshape df1 to long form (one row per feature/sample pair), then count per
# sample how many features of each list reach 5. Counting with sum() instead
# of filtering first keeps samples that have no qualifying value, so the join
# below yields 0 for them rather than NA.
df3 <- df1 %>%
  reshape2::melt(id.vars = "rownames") %>%
  group_by(variable) %>%
  summarise(
    x = sum(value >= 5 & rownames %in% c("m1", "m3")),
    y = sum(value >= 5 & rownames %in% c("m2", "m4"))
  )
# Attach the per-sample counts to the sample information.
df2 %>% left_join(df3, by = c("rownames" = "variable"))
rownames batch total_count x y
1 sample1 a 10 1 0
2 sample2 b 15 1 1
3 sample3 a 6 0 1
You can create a named list of vectors and for each rownames count how many values of x and y in the respective sample is >= 5.
Base R option -
# Pair each list name with its vector of feature ids.
list_vec <- list(x = x, y = y)
# One row per sample (in df2 order), one column per list: the number of listed
# features whose value in that sample's df1 column is at least 5. The lambda
# parameters are named so they do not shadow the x/y vectors, and vapply()
# guarantees one integer per list.
cbind(df2, do.call(rbind, lapply(df2$rownames, function(sample_id) {
  vapply(list_vec, function(ids) {
    sum(df1[[sample_id]][df1$rownames %in% ids] >= 5)
  }, integer(1))
})))
# rownames batch total.count x y
#1 sample1 a 10 1 0
#2 sample2 b 15 1 1
#3 sample3 a 6 0 1
Using tidyverse -
library(dplyr)
library(purrr)

# lst() names each element after the variable it was built from ("x", "y").
list_vec <- lst(x, y)
# For every sample, count per list how many listed features reach 5 and bind
# the resulting columns onto df2.
df2 %>%
  bind_cols(
    map_df(df2$rownames, function(sample_id) {
      map(
        list_vec,
        function(ids) sum(df1[[sample_id]][df1$rownames %in% ids] >= 5)
      )
    })
  )
This question already has answers here:
Merging a lot of data.frames [duplicate]
(1 answer)
How do I replace NA values with zeros in an R dataframe?
(29 answers)
Closed 2 years ago.
I want to merge the following 3 data frames and fill the missing values with -1. I think I should use the function merge(), but I don't know exactly how to do it.
> df1
Letter Values1
1 A 1
2 B 2
3 C 3
> df2
Letter Values2
1 A 0
2 C 5
3 D 9
> df3
Letter Values3
1 A -1
2 D 5
3 B -1
The desired output would be:
Letter Values1 Values2 Values3
1 A 1 0 -1
2 B 2 -1 -1 # fill missing values with -1
3 C 3 5 -1
4 D -1 9 5
code:
> dput(df1)
structure(list(Letter = structure(1:3, .Label = c("A", "B", "C"
), class = "factor"), Values1 = c(1, 2, 3)), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(Letter = structure(1:3, .Label = c("A", "C", "D"
), class = "factor"), Values2 = c(0, 5, 9)), class = "data.frame", row.names = c(NA,
-3L))
> dput(df3)
structure(list(Letter = structure(c(1L, 3L, 2L), .Label = c("A",
"B", "D"), class = "factor"), Values3 = c(-1, 5, -1)), class = "data.frame", row.names = c(NA,
-3L))
You can get data frames in a list and use merge with Reduce. Missing values in the new dataframe can be replaced with -1.
# Outer-join the three frames pairwise on their shared column(s), then put -1
# into every cell the joins left empty.
new_df <- Reduce(
  function(left, right) merge(left, right, all = TRUE),
  list(df1, df2, df3)
)
new_df <- replace(new_df, is.na(new_df), -1)
new_df
# Letter Values1 Values2 Values3
#1 A 1 0 -1
#2 B 2 -1 -1
#3 C 3 5 -1
#4 D -1 9 5
A tidyverse way with the same logic :
library(dplyr)
library(purrr)
library(tidyr) # provides replace_na()

# Outer-join all frames on the Letter key, then replace the NAs introduced by
# the joins with -1. Only numeric columns are touched: the Letter key is not
# numeric and must not receive a numeric fill value.
list(df1, df2, df3) %>%
  reduce(full_join, by = "Letter") %>%
  mutate(across(where(is.numeric), function(v) replace_na(v, -1)))
Here's a dplyr solution
# Outer-join the three frames on Letter, then replace the NAs introduced by
# the joins with -1 in the numeric value columns only. across(where(...)) is
# the current replacement for mutate_if(); replace_na() is namespace-qualified
# because tidyr is not attached in this snippet.
df1 %>%
  full_join(df2, by = "Letter") %>%
  full_join(df3, by = "Letter") %>%
  mutate(across(where(is.numeric), function(x) tidyr::replace_na(x, -1)))
output:
Letter Values1 Values2 Values3
<chr> <dbl> <dbl> <dbl>
1 A 1 0 -1
2 B 2 -1 -1
3 C 3 5 -1
4 D -1 9 5
I want to perform a join.
df1=structure(list(id = 1:3, group_id = c(10L, 20L, 40L)), class = "data.frame", row.names = c(NA,
-3L))
df2 has a different structure: its group_id field contains several groups per row, for example {10,100,400}.
so dput()
df2=structure(list(id = 1:3, group_id = structure(c(1L, 3L, 2L), .Label = c("{`10`,100,`40`}",
"{3,`40`,600,100}", "{4}"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2 contains group_id 10 and 40, but they are enclosed in braces together with other groups.
How can I get the desired joined output?
id group_id
1 10
1 40
3 40
You can clean group_id in df2 using gsub, bring each id in separate rows and filter.
library(dplyr)
df2 %>%
mutate(group_id = gsub('[{}`]', '', group_id)) %>%
tidyr::separate_rows(group_id) %>%
filter(group_id %in% df1$group_id)
# id group_id
#1 1 10
#2 1 40
#3 3 40
Here's a data.table alternative:
library(data.table)

# df2 must be a data.table for the `[, j, by]` syntax; as.data.table() works
# on a converted copy and leaves the original data frame untouched.
# Strip braces/backticks, split each row's ids on commas (one output row per
# id, in column V1), then keep the ids that also occur in df1$group_id.
as.data.table(df2)[, strsplit(gsub('[{}`]', '', group_id), ','), by = id][V1 %in% df1$group_id]
# id V1
#1: 1 10
#2: 1 40
#3: 3 40
Here is an option with base R using regmatches/gregexpr
local({
  # Every run of digits in each group_id string, one character vector per row.
  digits <- regmatches(df2$group_id, gregexpr("\\d+", df2$group_id))
  # Label each vector with its row's id so stack() carries the id along.
  names(digits) <- df2$id
  # stack() yields (values, ind); swap the columns and rename them.
  long <- stack(digits)[2:1]
  names(long) <- c("id", "group_id")
  # Keep only the ids that also occur in df1.
  long[long$group_id %in% df1$group_id, ]
})
# id group_id
#1 1 10
#3 1 40
#6 3 40
I have a list of data frames, where each data frame has either 1 or 2 rows named "mis" or "syn" (from a column named cat) and a second column with a numeric frequency. I want to fill in each data frame such that if the "mis" row is missing, it adds a mis row with frequency = 0,
and if the "syn" row is missing, it adds a syn row with frequency = 1:
###exmaple:
#example list of dataframes:
df1<- as.data.frame(cbind(cat = c("mis", "syn"), freq= c(4, 2)))
df2<- as.data.frame(cbind(cat = "mis", freq= 1))
df3<- as.data.frame(cbind(cat = "syn", freq= 2))
df_list<- list(df1 = df1, df2 = df2, df3= df3)
looks like:
> df_list
$df1
cat freq
1 mis 4
2 syn 2
$df2
cat freq
1 mis 1
$df3
cat freq
1 syn 2
Expected output:
> df_list
$df1
cat freq
mis 4
syn 2
$df2
cat freq
mis 1
syn 1
$df3
cat freq
syn 2
mis 0
what I've tried:
first I change the row names so that I can search by them
df_list_named<- lapply(df_list, function(x){ row.names(x)<-as.character(x$cat); x})
df_list_named
$df1
cat freq
mis mis 4
syn syn 2
$df2
cat freq
mis mis 1
$df3
cat freq
syn syn 2
Then I've been trying to use ifelse() to append the missing rows to the data frames that need them, but I can't get it to work:
test<- lapply(df_list_named, function (x) ifelse(!row.names(df_list_named[[x]]) %in% "mis", rbind(df_list_named[[x]], c(cat = "mis", freq= 0)),
ifelse(!row.names(df_list_named[[x]]) %in% "syn", rbind(df_list_named[[x]], c(cat = "syn", freq= 1))))
Here is one way to do it with lapply
# Complete every data frame in the list: frames that already hold both
# categories pass through unchanged; otherwise the missing category is
# appended with its default frequency (syn -> 1, mis -> 0).
lapply(df_list, function(d) {
  has_mis <- "mis" %in% d$cat
  has_syn <- "syn" %in% d$cat
  if (has_mis && has_syn) {
    return(d)
  }
  filler <- if (has_mis) {
    data.frame(cat = "syn", freq = 1)
  } else {
    data.frame(cat = "mis", freq = 0)
  }
  rbind(d, filler)
})
#$df1
# cat freq
#1 mis 4
#2 syn 2
#$df2
# cat freq
#1 mis 1
#2 syn 1
#$df3
# cat freq
#1 syn 2
#2 mis 0
data
# Example input: one complete frame, one lacking "syn", one lacking "mis".
df1 <- data.frame(
  cat = c("mis", "syn"),
  freq = c(4, 2),
  stringsAsFactors = FALSE
)
df2 <- data.frame(cat = "mis", freq = 1, stringsAsFactors = FALSE)
df3 <- data.frame(cat = "syn", freq = 2, stringsAsFactors = FALSE)
df_list <- setNames(list(df1, df2, df3), c("df1", "df2", "df3"))
You could use a "base" data frame and merge it with every data frame in the list using Map. The duplicated rows this creates in the already complete data frames can be safely dropped with !duplicated(), as they are always placed at the end.
# Template holding the default row for each category: "syn" -> 1, "mis" -> 0.
# NOTE(review): freq is built as a factor, presumably to match the factor freq
# columns of the sample data for this answer -- confirm before reusing.
(base <- data.frame(cat=factor(c("syn", "mis")), freq=factor(1:0)))
# cat freq
# 1 syn 1
# 2 mis 0
# Full-merge each list element with the template, then keep only the first row
# found for every cat value.
Map(function(x) {y <- (merge(x, base, all=TRUE));y[!duplicated(y$cat), ]}, df_list)
# $df1
# cat freq
# 1 mis 4
# 3 syn 2
#
# $df2
# cat freq
# 1 mis 1
# 3 syn 1
#
# $df3
# cat freq
# 1 syn 2
# 3 mis 0
Data
df_list <- list(df1 = structure(list(cat = structure(1:2, .Label = c("mis",
"syn"), class = "factor"), freq = structure(2:1, .Label = c("2",
"4"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L)), df2 = structure(list(cat = structure(c(cat = 1L), .Label = "mis", class = "factor"),
freq = structure(c(freq = 1L), .Label = "1", class = "factor")), class = "data.frame", row.names = c(NA,
-1L)), df3 = structure(list(cat = structure(c(cat = 1L), .Label = "syn", class = "factor"),
freq = structure(c(freq = 1L), .Label = "2", class = "factor")), class = "data.frame", row.names = c(NA,
-1L)))
This question already has answers here:
Merging a lot of data.frames [duplicate]
(1 answer)
Simultaneously merge multiple data.frames in a list
(9 answers)
Closed 5 years ago.
Here's my list of data frames:
[[1]]
ID Value
A 1
B 1
C 1
[[2]]
ID Value
A 1
D 1
E 1
[[3]]
ID Value
B 1
C 1
I'm after a single data frame with unique (non-redundant) IDs in the left hand column, replicates in columns, and NULL values as 0:
ID [1]Value [2]Value [3]Value
A 1 1 0
B 1 0 1
C 1 0 1
D 0 1 0
E 0 1 0
I've tried:
Reduce(function(x, y) merge(x, y, by=ID), datahere)
This provides a single list but without regards to where the original values come from, and duplicate IDs are repeated in new rows.
rbindlist(datahere, use.names=TRUE, fill=TRUE, idcol="Replicate")
This provides a single list with the [x]Value number as a new column called Replicate, but still it isn't in the structure I want as the ID column has redundancies.
What about something like this using dplyr/purrr:
# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(tidyverse)

# Full-join every data frame in `lst` on ID, from left to right.
reduce(lst, full_join, by = "ID")
# ID Value.x Value.y Value
# 1 A 1 1 NA
# 2 B 1 NA 1
# 3 C 1 NA 1
# 4 D NA 1 NA
# 5 E NA 1 NA
Or with the NAs replaced with 0s:
# Full-join all frames on ID, then overwrite every NA cell with 0.
lst %>%
  reduce(full_join, by = "ID") %>%
  replace(is.na(.), 0)
# ID Value.x Value.y Value
#1 A 1 1 0
#2 B 1 0 1
#3 C 1 0 1
#4 D 0 1 0
#5 E 0 1 0
Sample data
# Sample data: three frames sharing the ID key. stringsAsFactors is passed to
# each data.frame() call instead of being set through options(), so the IDs
# stay character without changing a session-wide setting.
lst <- list(
  data.frame(
    ID = c("A", "B", "C"), Value = c(1, 1, 1),
    stringsAsFactors = FALSE
  ),
  data.frame(
    ID = c("A", "D", "E"), Value = c(1, 1, 1),
    stringsAsFactors = FALSE
  ),
  data.frame(
    ID = c("B", "C"), Value = c(1, 1),
    stringsAsFactors = FALSE
  )
)
You already have a nice answer but the typical way to do this is with tidyr::spread
Your data
# Three example frames keyed by ID, each with a constant Value of 1.
A <- data.frame(ID = c("A", "B", "C"), Value = 1, stringsAsFactors = FALSE)
B <- data.frame(ID = c("A", "D", "E"), Value = 1, stringsAsFactors = FALSE)
C <- data.frame(ID = c("B", "C"), Value = 1, stringsAsFactors = FALSE)
# Collected, unnamed, in the order A, B, C.
L <- list(A, B, C)
Solution
# Stack the list into one long frame, tagging each row with an identifier of
# its source data frame in column G, then turn each G value into its own
# column of Value, filling absent ID/G combinations with 0.
# NOTE(review): tidyr::spread() is superseded; consider tidyr::pivot_wider().
dplyr::bind_rows(L, .id="G") %>%
tidyr::spread(G, Value, fill=0)
# ID 1 2 3
# 1 A 1 1 0
# 2 B 1 0 1
# 3 C 1 0 1
# 4 D 0 1 0
# 5 E 0 1 0
With base R, we need to use all = TRUE in the merge
# Outer-join the frames pairwise on ID (all = TRUE keeps IDs found in only one
# frame), then show the result with every NA replaced by 0.
res <- Reduce(
  function(left, right) merge(left, right, by = "ID", all = TRUE),
  lst
)
replace(res, is.na(res), 0)
# ID Value.x Value.y Value
#1 A 1 1 0
#2 B 1 0 1
#3 C 1 0 1
#4 D 0 1 0
#5 E 0 1 0
data
lst <- list(structure(list(ID = c("A", "B", "C"), Value = c(1, 1, 1)), .Names = c("ID",
"Value"), row.names = c(NA, -3L), class = "data.frame"), structure(list(
ID = c("A", "D", "E"), Value = c(1, 1, 1)), .Names = c("ID",
"Value"), row.names = c(NA, -3L), class = "data.frame"), structure(list(
ID = c("B", "C"), Value = c(1, 1)), .Names = c("ID", "Value"
), row.names = c(NA, -2L), class = "data.frame"))