I have a dataframe like this:
ID S1 C
1 1 2 3
2 1 2 3
3 3 1 1
4 6 2 5
5 6 7 5
What I need is the number of rows per group ID where S1 <= C. This is the desired output.
ID Obs
1 1 2
2 3 1
3 6 1
Even though the question was answered below, I have a follow-up question: is it possible to do the same for multiple columns (S1, S2, ...)? For example, for the dataframe below:
ID S1 S2 C
1 1 2 2 3
2 1 2 2 3
3 3 1 1 1
4 6 2 2 5
5 6 7 7 5
And then get:
ID S1.Obs S2.Obs
1 1 2 2
2 3 1 1
3 6 1 1
A base R solution with aggregate().
aggregate(Obs ~ ID, transform(df, Obs = S1 <= C), sum)
# ID Obs
# 1 1 2
# 2 3 1
# 3 6 1
A dplyr solution
library(dplyr)
df %>%
  filter(S1 <= C) %>%
  count(ID, name = "Obs")
# ID Obs
# 1 1 2
# 2 3 1
# 3 6 1
Data
df <- structure(list(ID = c(1L, 1L, 3L, 6L, 6L), S1 = c(2L, 2L, 1L, 2L, 7L),
C = c(3L, 3L, 1L, 5L, 5L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5"))
Extension
If you want to apply this rule on multiple columns such as S1, S2, S3:
df %>%
  group_by(ID) %>%
  summarise(across(starts_with("S"), ~ sum(.x <= C)))
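As a minimal sketch of the follow-up case, assuming a hypothetical data frame (here called df2) that adds the S2 column from the question, the .names argument of across() can also reproduce the S1.Obs / S2.Obs column names shown in the desired output:
# Hypothetical data for the follow-up example (S2 copied from S1, as in the question)
df2 <- data.frame(ID = c(1, 1, 3, 6, 6),
                  S1 = c(2, 2, 1, 2, 7),
                  S2 = c(2, 2, 1, 2, 7),
                  C  = c(3, 3, 1, 5, 5))
library(dplyr)
df2 %>%
  group_by(ID) %>%
  summarise(across(starts_with("S"), ~ sum(.x <= C), .names = "{.col}.Obs"))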
data <- data.frame(
  ID = c(1, 1, 3, 6, 6),
  S1 = c(2, 2, 1, 2, 7),
  C = c(3, 3, 1, 5, 5)
)
library(dplyr)
data.filtered <- data[data$S1 <= data$C, ]
data.filtered %>%
  group_by(ID) %>%
  summarize(Obs = length(ID))
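A small variation (not part of the original answer): n() counts the rows per group directly, so the same result can be written as
library(dplyr)
data[data$S1 <= data$C, ] %>%
  group_by(ID) %>%
  summarize(Obs = n())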
An option with data.table
library(data.table)
setDT(df)[S1 <= C, .(Obs = .N), ID]
# ID Obs
#1: 1 2
#2: 3 1
#3: 6 1
data
df <- structure(list(ID = c(1L, 1L, 3L, 6L, 6L), S1 = c(2L, 2L, 1L, 2L, 7L),
C = c(3L, 3L, 1L, 5L, 5L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5"))
Related
I have this list of dataframes created as follows :
df = data.frame(x = c(1,0,0,0,1,1,1,NA), y = c(2,2,2,2,3,3,2,NA),
                z = c(1:7,NA), m = c(1,2,3,1,2,3,1,NA))
df$x = factor(df$x)
df$y = factor(df$y)
df$m = factor(df$m)
l1 = list(df$x,df$y,df$m)
l2 = lapply(l1,table)
l3 = lapply(l2,as.data.frame)
l3
The output is as follows :
[[1]]
Var1 Freq
1 0 3
2 1 4
[[2]]
Var1 Freq
1 2 5
2 3 2
[[3]]
Var1 Freq
1 1 3
2 2 2
3 3 2
I would like the names of the variables from the dataframe to be assigned automatically to the l3 list elements. For example: Var1 from list 1 becomes x, Var1 from list 2 becomes y, and Var1 from list 3 becomes m. Thanks
Using Map.
l3 <- lapply(df, table) |> lapply(as.data.frame)
(l3 <- Map(\(x, y) {names(x)[1] <- y; x}, l3, names(l3)))
# $x
# x Freq
# 1 0 3
# 2 1 4
#
# $y
# y Freq
# 1 2 5
# 2 3 2
#
# $z
# z Freq
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 1
# 5 5 1
# 6 6 1
# 7 7 1
#
# $m
# m Freq
# 1 1 3
# 2 2 2
# 3 3 2
Data:
df <- structure(list(x = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 2L, NA
), levels = c("0", "1"), class = "factor"), y = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 1L, NA), levels = c("2", "3"), class = "factor"),
z = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, NA), m = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, NA), levels = c("1", "2", "3"), class = "factor")), row.names = c(NA,
-8L), class = "data.frame")
One possible solution:
Map(\(x, y) setNames(x, c(y, "Freq")), l3, c("x", "y", "m"))
[[1]]
x Freq
1 0 3
2 1 4
[[2]]
y Freq
1 2 5
2 3 2
[[3]]
m Freq
1 1 3
2 2 2
3 3 2
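For completeness, a hedged alternative using purrr::imap, assuming l3 is the named list built with lapply(df, table) in the first answer; .y is the element name, so the first column is renamed to it:
library(purrr)
imap(l3, ~ setNames(.x, c(.y, "Freq")))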
I have a dataset, which I define, for example, like this:
type <- c(1,1,1,2,2,2,2,2,3,3,4,4,5,5)
val <- c(4,1,1,2,8,2,3,2,3,3,4,4,5,7)
tdt <- data.frame(type, val)
So it looks like this:
type val
1 4
1 1
1 1
2 2
2 8
2 2
2 3
2 2
3 3
3 3
4 4
4 4
5 5
5 7
I want to find how many unique vals each type gets (turnover). So desired result is:
type turnover
1 2
2 3
3 1
4 1
5 2
How could I get it? What should such a function look like? I know how to count occurrences of each type, but not the number of unique vals per type.
With n_distinct, we can get the number of unique elements grouped by 'type'
library(dplyr)
tdt %>%
  group_by(type) %>%
  summarise(turnover = n_distinct(val))
# A tibble: 5 x 2
# type turnover
# <int> <int>
#1 1 2
#2 2 3
#3 3 1
#4 4 1
#5 5 2
Or with distinct and count
tdt %>%
  distinct() %>%
  count(type)
# type n
#1 1 2
#2 2 3
#3 3 1
#4 4 1
#5 5 2
Or using uniqueN from data.table
library(data.table)
setDT(tdt)[, .(turnover = uniqueN(val)), type]
Or with table in base R after getting the unique rows
table(unique(tdt)$type)
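If that named table should end up as a two-column data frame like the desired output, a small follow-up sketch (the column names are chosen here just for illustration):
out <- as.data.frame(table(unique(tdt)$type))
names(out) <- c("type", "turnover")
out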
data
tdt <- structure(list(type = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
4L, 4L, 5L, 5L), val = c(4L, 1L, 1L, 2L, 8L, 2L, 3L, 2L, 3L,
3L, 4L, 4L, 5L, 7L)), class = "data.frame", row.names = c(NA,
-14L))
Another base R option is using aggregate
tdtout <- aggregate(val ~ ., tdt, function(v) length(unique(v)))
such that
> tdtout
type val
1 1 2
2 2 3
3 3 1
4 4 1
5 5 2
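If the second column should be called turnover rather than val, it can simply be renamed afterwards:
names(tdtout)[2] <- "turnover"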
data
> dput(tdt)
structure(list(type = c(1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 5,
5), val = c(4, 1, 1, 2, 8, 2, 3, 2, 3, 3, 4, 4, 5, 7)), class = "data.frame", row.names = c(NA,
-14L))
I am converting old base R code into tidyverse and could use some help. I want to reverse code some vars in df1 conditional on the variable being tagged as positive==1 in a lookup table df2. Here's my base R solution:
library(tidyverse)
set.seed(1)
df1 <- data.frame(item1 = sample(1:4, 10, replace = TRUE),
                  item2 = sample(1:4, 10, replace = TRUE),
                  item3 = sample(1:4, 10, replace = TRUE))
df1
# item1 item2 item3
# 1 2 1 4
# 2 2 1 1
# 3 3 3 3
# 4 4 2 1
# 5 1 4 2
# 6 4 2 2
# 7 4 3 1
# 8 3 4 2
# 9 3 2 4
# 10 1 4 2
df2 <- data.frame(name = c("item1", "item2"),
                  positive = c(1, 0))
# name positive
# 1 item1 1
# 2 item2 0
vars <- c("item1", "item2")
# reverse code if positive==1
# 4=1, 3=2, 2=3, 1=4
for (i in vars) {
if (df2$positive[df2$name==i]==1) {
df1[i] <- 4 - df1[, i] + 1 # should reverse code item1
}
}
df1
# item1 item2 item3
# 1 3 1 4
# 2 3 1 1
# 3 2 3 3
# 4 1 2 1
# 5 4 4 2
# 6 1 2 2
# 7 1 3 1
# 8 2 4 2
# 9 2 2 4
# 10 4 4 2
We can use mutate_at, where we select the columns by subsetting the 'name' column of df2 based on the 'positive' values converted to logical, and then reverse-code each selected column as 4 - x + 1
library(dplyr)
dfn <- df1 %>%
  mutate_at(vars(intersect(names(.),
                           as.character(df2$name)[as.logical(df2$positive)])),
            ~ 4 - . + 1)
dfn
# item1 item2 item3
#1 3 1 4
#2 3 1 1
#3 2 3 3
#4 1 2 1
#5 4 4 2
#6 1 2 2
#7 1 3 1
#8 2 4 2
#9 2 2 4
#10 4 4 2
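For newer dplyr versions, where mutate_at is superseded, the same column selection can be written with across() and any_of(); a sketch, not part of the original answer (pos_vars is just a helper name used here):
library(dplyr)
pos_vars <- as.character(df2$name)[as.logical(df2$positive)]  # columns tagged positive == 1
df1 %>%
  mutate(across(any_of(pos_vars), ~ 4 - .x + 1))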
Or with base R
vars1 <- with(df2, as.character(name[as.logical(positive)]))
df1[vars1] <- lapply(df1[vars1], function(x) 4 - x + 1)
data
df1 <- structure(list(item1 = c(2L, 2L, 3L, 4L, 1L, 4L, 4L, 3L, 3L,
1L), item2 = c(1L, 1L, 3L, 2L, 4L, 2L, 3L, 4L, 2L, 4L), item3 = c(4L,
1L, 3L, 1L, 2L, 2L, 1L, 2L, 4L, 2L)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
I would like to find a way to do something very similar to this question:
Increment by 1 for every change in column
But I want to restart the counter when var1 == "c".
Using
df$var2 <- with(rle(as.character(df$var1)), rep(seq_along(values), lengths))
results in column var2 below (the "Should be" column shows the desired result):
var1 var2 Should be
a 1 1
a 1 1
1 2 2
0 3 3
b 4 4
b 4 4
b 4 4
c 5 1
1 6 2
1 6 2
In data.table you can use rleid to get a run-length-id for var1 within each group.
library(data.table)
setDT(df)
df[, var2 := rleid(var1), by = cumsum(var1 == "c")]
df
# var1 var2
# 1: a 1
# 2: a 1
# 3: 1 2
# 4: 0 3
# 5: b 4
# 6: b 4
# 7: b 4
# 8: c 1
# 9: 1 2
#10: 1 2
and using dplyr
library(dplyr)
df %>%
  group_by(group = cumsum(var1 == "c")) %>%
  mutate(var2 = cumsum(var1 != lag(var1, default = first(var1))) + 1)
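If the helper group column is not wanted in the result, it can be dropped afterwards; a small follow-up sketch of the same pipeline:
df %>%
  group_by(group = cumsum(var1 == "c")) %>%
  mutate(var2 = cumsum(var1 != lag(var1, default = first(var1))) + 1) %>%
  ungroup() %>%
  select(-group)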
data
df <- structure(list(var1 = structure(c(3L, 3L, 2L, 1L, 4L, 4L, 4L,
5L, 2L, 2L), .Label = c("0", "1", "a", "b", "c"), class = "factor")),
class = "data.frame", row.names = c(NA, -10L))
We can use the OP's code with rle in base R with ave
df$var2 <- with(df, as.integer(ave(as.character(var1), cumsum(var1 == 'c'),
                                    FUN = function(x) with(rle(x), rep(seq_along(values), lengths)))))
df$var2
#[1] 1 1 2 3 4 4 4 1 2 2
data
df <- structure(list(var1 = structure(c(3L, 3L, 2L, 1L, 4L, 4L, 4L,
5L, 2L, 2L), .Label = c("0", "1", "a", "b", "c"), class = "factor")),
class = "data.frame", row.names = c(NA,
-10L))
I am working with a gigantic person-period file, and I thought that
a good way to deal with such a large dataset is to use a sampling and re-sampling technique.
My person-period file looks like this:
id code time
1 1 a 1
2 1 a 2
3 1 a 3
4 2 b 1
5 2 c 2
6 2 b 3
7 3 c 1
8 3 c 2
9 3 c 3
10 4 c 1
11 4 a 2
12 4 c 3
13 5 a 1
14 5 c 2
15 5 a 3
I actually have two distinct issues.
The first issue is that I am having trouble simply sampling a person-period file.
For example, I would like to sample 2 id-sequences, such as:
id code time
1 a 1
1 a 2
1 a 3
2 b 1
2 c 2
2 b 3
The following line of code works for sampling a person-period file:
dt[which(dt$id %in% sample(dt$id, 2)), ]
However, I would like to use a dplyr solution because I am interested in resampling, and in particular I would like to use replicate.
I am interested in doing something like replicate(100, sample_n(dt, 2), simplify = FALSE)
I am struggling with the dplyr solution because I am not sure what the grouping variable should be.
library(dplyr)
dt %>% group_by(id) %>% sample_n(1)
gives me an incorrect result because it does not keep the full sequence of each id.
Any clue how I could both sample and re-sample a person-period file?
data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L, 4L, 4L, 4L, 5L, 5L, 5L), .Label = c("1", "2", "3", "4", "5"
), class = "factor"), code = structure(c(1L, 1L, 1L, 2L, 3L,
2L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 1L), .Label = c("a", "b",
"c"), class = "factor"), time = structure(c(1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2",
"3"), class = "factor")), .Names = c("id", "code", "time"), row.names = c(NA,
-15L), class = "data.frame")
I think the idiomatic way would probably look like
set.seed(1)
samp = dt %>% select(id) %>% distinct %>% sample_n(2)
left_join(samp, dt)
id code time
1 2 b 1
2 2 c 2
3 2 b 3
4 5 a 1
5 5 c 2
6 5 a 3
This extends straightforwardly to more grouping variables and fancier sampling rules.
If you need to do this many times...
nrep = 100
ng = 2
samps = dt %>% select(id) %>% distinct %>%
  slice(rep(1:n(), nrep)) %>%
  mutate(r = rep(1:nrep, each = n() / nrep)) %>%
  group_by(r) %>% sample_n(ng)
repdat = left_join(samps, dt)
# then do stuff with it:
repdat %>% group_by(r) %>% do_stuff
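As an illustrative stand-in for do_stuff (the summary chosen here is only an example), one could compute per-replicate counts:
repdat %>%
  group_by(r) %>%
  summarise(n_rows = n(), n_ids = n_distinct(id))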
I imagine you are doing some simulations and may want to do the subsetting many times. You probably also want to try this data.table method and utilize the fast binary search feature on the key column:
library(data.table)
setDT(dt)
setkey(dt, id)
replicate(2, dt[list(sample(unique(id), 2))], simplify = F)
#[[1]]
# id code time
#1: 3 c 1
#2: 3 c 2
#3: 3 c 3
#4: 5 a 1
#5: 5 c 2
#6: 5 a 3
#[[2]]
# id code time
#1: 3 c 1
#2: 3 c 2
#3: 3 c 3
#4: 4 c 1
#5: 4 a 2
#6: 4 c 3
We can use filter with sample
dt %>%
  filter(id %in% sample(unique(id), 2, replace = FALSE))
NOTE: The OP asked for a dplyr method, and this solution does use dplyr.
If we need to replicate the sampling, one option would be to use map from purrr
library(purrr)
dt %>%
  distinct(id) %>%
  replicate(2, .) %>%
  map(~ sample(., 2, replace = FALSE)) %>%
  map(~ filter(dt, id %in% .))
#$id
# id code time
#1 1 a 1
#2 1 a 2
#3 1 a 3
#4 4 c 1
#5 4 a 2
#6 4 c 3
#$id
# id code time
#1 4 c 1
#2 4 a 2
#3 4 c 3
#4 5 a 1
#5 5 c 2
#6 5 a 3
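A further option for repeating the sampling many times without replicate(), sketched with purrr::map over an index (the 100 replicates are illustrative):
library(dplyr)
library(purrr)
resamples <- map(1:100, ~ dt %>% filter(id %in% sample(unique(id), 2)))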