Say that I have this data.frame, data:
data <- data.frame(val=c(rep(6,10), rep(7, 15), rep(8, 20), rep(9, 25), rep(10, 100), rep(11, 20), rep(12, 15), rep(13, 10)))
data$plus <- data$val + 100
My goal is to create a new data.frame that has the frequencies of each val, and the associated plus value.
My current strategy is to create a table (called table), merge in the frequencies, and then keep only the first observation within each group:
table <- table(data$val)
df1 <- data.frame(val = as.integer(names(table)[1:length(table)]), N = table[1:length(table)])
df2 <- merge(data, df1)
df3 <- do.call(rbind, by(df2, list(df2$val), FUN=function(x) head(x, 1)))
This works, but it seems clunky.
In Stata, for example, this would take less, and simpler, code. Something like:
bys val plus: gen N = _N
bys val plus: gen first = _n==1
keep if first==1
Is there a way to simplify or make more elegant the R code?
Here's an approach using "data.table":
library(data.table)
as.data.table(data)[, N := .N, by = val][, .SD[1], by = val]
# val plus N
# 1: 6 106 10
# 2: 7 107 15
# 3: 8 108 20
# 4: 9 109 25
# 5: 10 110 100
# 6: 11 111 20
# 7: 12 112 15
# 8: 13 113 10
## Or (@RicardoSaporta)
as.data.table(data)[, list(.N, plus=plus[1]), by = val]
## Or (@DavidArenburg)
unique(as.data.table(data)[, N := .N, by = val], by = "val")
With "dplyr", you can try:
library(dplyr)
data %>%
  group_by(val) %>%
  mutate(N = n()) %>%
  slice(1)
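On newer versions of dplyr (0.8 or later), add_count() can replace the group_by()/mutate(n()) pair; a sketch:
library(dplyr)
data %>%
  add_count(val, name = "N") %>%    # adds the group sizes without leaving the data grouped
  distinct(val, .keep_all = TRUE)   # keeps the first row of each val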
In base R, I guess you can try something like:
do.call(rbind, lapply(split(data, data$val),
                      function(x) cbind(x, N = nrow(x))[1, ]))
Or you can use aggregate()
data$N = 0  # placeholder column so the formula below has something to count
out = aggregate(N ~ val + plus, data = data, length)
or else
out = aggregate(plus ~ val, data = data, function(x) c(unique(x), N = length(x)))
do.call(data.frame, out)
or using ddply
library(plyr)
out = ddply(data, .(val,plus), summarize, N = length(plus))
#> out
# val plus N
#1 6 106 10
#2 7 107 15
#3 8 108 20
#4 9 109 25
#5 10 110 100
#6 11 111 20
#7 12 112 15
#8 13 113 10
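One more base R route, for completeness (a sketch; the explicit ordering step keeps the rows aligned with table()'s sorted names):
first <- data[!duplicated(data$val), ]   # first observation within each val group
first <- first[order(first$val), ]       # table() sorts by val, so match that order
first$N <- as.vector(table(data$val))    # frequencies per val
first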
I have a unique problem where I would like to add a column of percentiles for each group in a data frame. Here is what my data look like:
library(tidyverse)
set.seed(123)
df <- tibble(id = 1:100,
             group = rep(letters[1:4], 25),
             x = c(sample(1:100, 25, replace = T),
                   sample(101:200, 25, replace = T),
                   sample(201:300, 25, replace = T),
                   sample(301:400, 25, replace = T)))
> df
# A tibble: 100 x 3
id group x
<int> <chr> <int>
1 1 a 78
2 2 b 80
3 3 c 7
4 4 d 100
5 5 a 45
6 6 b 76
7 7 c 25
8 8 d 91
9 9 a 13
10 10 b 84
# ... with 90 more rows
# Function to create a table ten percentiles for a numeric vector
percentiles_table <- function(x) {
  res <- round(quantile(x, probs = seq(from = .1, to = 1, by = 0.1)), 0)
  res <- data.frame(percentile = names(res), to = res)
  res <- res %>%
    mutate(from = lag(to, default = 0)) %>%
    select(from, to, percentile)
}
# Table of percentiles
percentiles <- df %>%
  group_by(group) %>%
  summarise(percentiles_table(x)) %>%
  ungroup()
> percentiles
# A tibble: 40 x 4
group from to percentile
<chr> <dbl> <dbl> <chr>
1 a 0 25 10%
2 a 25 71 20%
3 a 71 106 30%
4 a 106 125 40%
5 a 125 198 50%
6 a 198 236 60%
7 a 236 278 70%
8 a 278 325 80%
9 a 325 379 90%
10 a 379 389 100%
I would like to add the percentile column to df for each group where the value of x falls between from and to.
There might be some way to calculate the percentile column directly, without having it calculated in a separate data.frame and then appending it back to df.
A one-liner with my santoku package:
library(santoku)
df |>
  group_by(group) |>
  mutate(
    percentile = chop_quantiles(x, 0:100/100,
                                labels = lbl_endpoint())
  )
# A tibble: 100 × 4
# Groups: group [4]
id group x percentile
<int> <chr> <int> <fct>
1 1 a 35 8%
2 2 b 97 20%
3 3 c 39 4%
4 4 d 20 8%
5 5 a 89 16%
...
Using data.table:
setDT(df)[
,
percentile := cut(
x,
quantile(x, seq(0, 1, 0.1)),
include.lowest = TRUE,
labels = paste0(seq(10, 100, 10), "%")
),
by = group
]
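The same cut()/quantile() idea ports directly to dplyr if you prefer to stay in the tidyverse (a sketch; like the data.table version, it assumes the within-group quantile breaks are unique, otherwise cut() errors):
library(dplyr)
df %>%
  group_by(group) %>%
  mutate(percentile = cut(x,
                          quantile(x, seq(0, 1, 0.1)),
                          include.lowest = TRUE,
                          labels = paste0(seq(10, 100, 10), "%"))) %>%
  ungroup()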
install.packages("zoo")
library(zoo)
y=as.data.frame(c(0:max(percentiles$to)))
y=merge(y,unique(percentiles[,c(1)]))
y=merge(y,percentiles[,c(1,2,4)], by.x = c("group","c(0:max(percentiles$to))"), by.y = c("group","from"), all.x = TRUE)
y=na.locf(y)
df=merge(df,y, all.x = TRUE, by.x = c("group","x"), by.y = c("group","c(0:max(percentiles$to))"))
I got this working solution.
percentile_ranks <- function(x) {
  res <- trunc(rank(x)) / length(x) * 100
  floor(res / 10)
}
df <- df %>%
  group_by(group) %>%
  arrange(x) %>%
  mutate(percentile = percentile_ranks(x)) %>%
  mutate(percentile_pct = paste0(percentile * 10, "%")) %>%
  ungroup() %>%
  arrange(id) # restore the original data.frame order
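A close relative of this uses ecdf(), which gives the proportion of values less than or equal to each x (it can differ from the rank-based version at ties, and its labels run 10%-100% rather than starting at 0%); a sketch:
library(dplyr)
df %>%
  group_by(group) %>%
  mutate(percentile = paste0(ceiling(ecdf(x)(x) * 10) * 10, "%")) %>%  # decile-style labels
  ungroup()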
I want to compile this data frame into a data frame with only one row for each id. For each of the columns value1, value2, value3, I want to take the difference between the value for A and the value for B (A - B). I have tried to write a for loop that runs over the ids, but I don't know how to continue; any advice? A function that can run over many, many columns would be really helpful; this is just an example with 3 columns. Thanks!
df <- data.frame(
  id = rep(1:5, each = 2),
  trt = c("A", "B", "A", "B", "B", "A", "B", "A", "A", "B"),
  value1 = sample(1:100, 10),
  value2 = sample(1:100, 10),
  value3 = sample(1:100, 10),
  stringsAsFactors = FALSE)
> df
id trt value1 value2 value3
1 A 98 64 91
1 B 64 48 16
2 A 26 31 87
2 B 93 62 24
3 B 32 37 71
3 A 83 6 92
4 B 82 68 36
4 A 30 5 66
5 A 87 29 48
5 B 86 47 82
for (i in unique(df$id)) {
  d <- df[df$id == i, ]
  d$value1[d$trt == "A"] - d$value1[d$trt == "B"]
  print(i)
}
Here is a base R solution with the split/lapply/combine strategy. See Hadley Wickham's paper, The Split-Apply-Combine Strategy for Data Analysis.
cols <- grep("value", names(df), value = TRUE)
sp <- split(df[c("trt", cols)], df$id)
res <- lapply(seq_along(sp), function(i){
  X <- sp[[i]][order(sp[[i]]$trt), ]   # A row first, B row second
  Y <- X[1, cols] - X[2, cols]         # A minus B for every value column
  cbind(data.frame(id = names(sp)[i]), Y)
})
res <- do.call(rbind, res)
res
# id value1 value2 value3
#1 1 -6 63 28
#3 2 -32 8 -18
#6 3 6 -7 -63
#8 4 -67 -45 2
#9 5 1 20 45
Your Data:
set.seed(42)
df <- data.frame(
  id = rep(1:5, each = 2),
  trt = c("A", "B", "A", "B", "B", "A", "B", "A", "A", "B"),
  value1 = sample(1:100, 10),
  value2 = sample(1:100, 10),
  value3 = sample(1:100, 10),
  stringsAsFactors = FALSE)
Solution:
library(tidyr)
library(dplyr)
df %>%
  gather(key, value, -trt, -id) %>%
  mutate(value = if_else(trt == "B", -value, value)) %>%
  group_by(id, key) %>%
  summarise(value = sum(value)) %>%
  ungroup() %>%
  spread(key, value)
The point here is to first transform the dataframe into long form with gather, so that all the values sit in one column. Since the values for B must be subtracted, I set them as negative. At that point I am free to sum all values by id and key (key holds the labels value1, value2, value3). I can then return to the initial configuration of the dataframe, with one column per "value", using spread. It doesn't matter how many "valueN" columns you have.
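On tidyr 1.0 or later, gather/spread are superseded by pivot_longer/pivot_wider; the same idea reads as (a sketch):
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(starts_with("value")) %>%                  # one row per id/trt/valueN
  mutate(value = if_else(trt == "B", -value, value)) %>%  # flip the sign for B
  group_by(id, name) %>%
  summarise(value = sum(value), .groups = "drop") %>%     # A - B per id and column
  pivot_wider(names_from = name, values_from = value)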
You might also want to try the data.table package (using the same reproducible data as @Edo):
library(data.table)
setDT(df, key = c("id","trt"))
df[, lapply(.SD, function(x) diff(x) * -1), by = id, .SDcols = -2]
The result:
id value1 value2 value3
1: 1 -16 17 -58
2: 2 -49 23 -55
3: 3 82 48 -18
4: 4 -23 9 13
5: 5 -18 -79 -7
Alternatively, you could also do:
library(data.table)
setorder(df, id, -trt)
setDT(df)[, lapply(.SD, diff), by = id, .SDcols = -2]
I am having an issue multiplying 3 columns by 3 different constants (i.e., 2, 3, 4, respectively) and then summing each row after applying the conversion.
I am using dplyr
variable <- df %>% transmute(df, sum(col1, col2*2, col3*3, col4*4))
We could do
library(dplyr)
df %>%
  mutate(a = a * 2,
         b = b * 3,
         c = c * 4,
         total = a + b + c)
# a b c total
#1 2 18 44 64
#2 4 21 48 73
#3 6 24 52 82
#4 8 27 56 91
#5 10 30 60 100
Using rowSums
df %>%
  mutate(a = a * 2,
         b = b * 3,
         c = c * 4) %>%
  mutate(total = rowSums(.))
Important to note: if we are using rowSums, we need to put it in a second mutate call, not the same one; otherwise it would sum the original df and not the changed one.
Or in base R
df1 <- transform(df, a = a * 2, b = b * 3, c = c * 4)
df1$total <- rowSums(df1)
data
df <- data.frame(a = 1:5, b = 6:10, c = 11:15)
In base R, we can do this more compactly with %*%
df$total <- c(as.matrix(df) %*% 2:4)
df
# a b c total
#1 1 6 11 64
#2 2 7 12 73
#3 3 8 13 82
#4 4 9 14 91
#5 5 10 15 100
Or with crossprod
df$total <- c(crossprod(t(df), 2:4))
Or with tidyverse
library(tidyverse)
map2(df, 2:4, ~ .x * .y) %>%
reduce(`+`) %>%
bind_cols(df, total = .)
data
df <- data.frame(a = 1:5, b = 6:10, c = 11:15)
variable <- df %>%
rowwise() %>%
mutate(new_var = sum(col1, col2*2, col3*3, col4*4))
Try that instead.
Add rowwise() to have the data analyzed at each row.
Use mutate() to get the new calculation.
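On dplyr 1.0 or later you can also avoid spelling out every term inside sum() by combining rowwise() with c_across(); a sketch on the a/b/c data used above, with weights 2:4:
library(dplyr)
df %>%
  rowwise() %>%
  mutate(total = sum(c_across(a:c) * 2:4)) %>%  # elementwise weights, then the row sum
  ungroup()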
I am trying to test for a condition across a range of columns. The data looks something like this
Name DPD_1 DPD_2 DPD_3 Default_flag
1: A 46 63 138 TRUE
2: B 12 82 33 FALSE
3: C 95 71 55 TRUE
4: D 57 133 116 TRUE
5: E 48 27 137 TRUE
In the code I need to test if any of DPD_1, DPD_2 or DPD_3 are greater than 90, in which case the Default_flag gets set to TRUE.
The code I am using for this is given below
df1 <- data.table(Name = LETTERS[1:10],DPD_1 = sample(1:100,10),DPD_2 = sample(1:200,10),DPD_3 = sample(1:200,10) )
df1[,Default_flag := ifelse((DPD_1>=90 | DPD_2>=90 | DPD_3>=90 ),TRUE,FALSE)]
Now the problem is that with some datasets I need to increase the DPD checks from DPD_1 to, say, DPD_24 (checking 24 columns instead of just 3 as in the current example). Is there any way I can avoid specifying each DPD number in the ifelse statement? I am happy to lose the ifelse statement, and if some version of apply can work, I would be happy to use that too.
We can use Reduce with | after specifying the columns of interest in .SDcols
df1[, Default_flag := Reduce(`|`, lapply(.SD, `>=`, 90)), .SDcols = DPD_1:DPD_3]
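An equivalent that avoids listing the columns at all is rowSums() over .SD (a sketch; patterns() in .SDcols needs a reasonably recent data.table, around 1.12.0 or later):
df1[, Default_flag := rowSums(.SD >= 90) > 0, .SDcols = patterns("^DPD_")]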
Update
Based on the OP's comment, if we need to create a function to automatically detect the column names, we can use grep to get the column names based on a pattern. The function below takes the dataset ('dat'), a pattern ('pat'), a value to compare ('val'), and 'n', the number of columns of that pattern to check.
f1 <- function(dat, pat, val, n){
  tmp <- as.data.table(dat)
  nm1 <- head(grep(pat, names(tmp), value = TRUE), n)
  tmp[, Default_flag := Reduce(`|`, lapply(.SD, `>=`, val)), .SDcols = nm1][]
}
f1(df1, "DPD", 90, 2)
f1(df1, "DPD", 90, 3)
As per @aelwan's request, an option using tidyverse would be
library(tidyverse)
f2 <- function(dat, pat, val, n){
  pat <- quo_name(enquo(pat))
  nm1 <- head(grep(pat, names(dat), value = TRUE), n)
  dat %>%
    mutate_at(vars(nm1), funs(. >= val)) %>%
    select_at(nm1) %>%
    reduce(`|`) %>%
    mutate(dat, Default_flag = .)
}
f2(df1, DPD, 90, 2)
f2(df1, DPD, 90, 3)
identical(f1(df1, "DPD", 90, 2), as.data.table(f2(df1, DPD, 90, 2)))
#[1] TRUE
identical(f1(df1, "DPD", 90, 3), as.data.table(f2(df1, DPD, 90, 3)))
#[1] TRUE
Another alternative to @akrun's accepted answer is a tidyverse solution which makes use of the tidyr::gather function (as suggested by @r2evans in the comments).
The relevant variable names are detected using dplyr::starts_with rather than grep.
suppressPackageStartupMessages(library(tidyverse))
set.seed(12345)
df1 <- tibble(Name = LETTERS[1:10],
              DPD_1 = sample(1:100, 10),
              DPD_2 = sample(1:200, 10),
              DPD_3 = sample(1:200, 10))
df1 %>%
  select(Name, starts_with("DPD_")) %>%
  gather(DPD_name, DPD_value, -Name) %>%
  group_by(Name) %>%
  summarise(Default_Flag = any(DPD_value > 90)) %>%
  { left_join(df1, ., by = "Name") }
#> # A tibble: 10 x 5
#> Name DPD_1 DPD_2 DPD_3 Default_Flag
#> <chr> <int> <int> <int> <lgl>
#> 1 A 73 7 91 TRUE
#> 2 B 87 31 66 FALSE
#> 3 C 75 146 192 TRUE
#> 4 D 86 1 140 TRUE
#> 5 E 44 77 127 TRUE
#> 6 F 16 91 77 TRUE
#> 7 G 31 76 136 TRUE
#> 8 H 48 78 106 TRUE
#> 9 I 67 35 44 FALSE
#> 10 J 91 182 93 TRUE
Try:
df <- read.table(header = TRUE, text = " Name DPD_1 DPD_2 DPD_3 Default_flag
1: A 46 63 138 TRUE
2: B 12 82 33 FALSE
3: C 95 71 55 TRUE
4: D 57 133 116 TRUE
5: E 48 27 137 TRUE")
df
df$Df.flag <- apply(df[, colnames(df) %in% paste0("DPD_", 1:3)],
                    1,
                    function(x) any(x > 90))
df
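If you are on dplyr 1.0.4 or later, if_any() expresses the "any column meets the condition" test directly (a sketch, using the question's df1):
library(dplyr)
df1 %>%
  mutate(Default_flag = if_any(starts_with("DPD_"), ~ .x >= 90))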
I have two vectors with common and repeated elements. I want a table comparing the frequency of the common elements in both vectors. Here is a subset:
plyr::count(V1)
x freq
1 A*02:01 106
2 A*02:02 88
3 A*03:01 95
4 A*03:02 60
plyr::count(V2)
x freq
1 A*02:01 11
2 A*02:02 11
3 A*02:04 1
4 A*03:01 20
The output I want is:
x freq.V1 freq.V2
1 A*02:01 106 11
2 A*02:02 88 11
3 A*03:01 95 20
I think merge is a good choice here, as the default is to keep only the observations common to both datasets. So the following should work:
merge(plyr::count(V1), plyr::count(V2), by="x")
Worked example
plyr::count(mtcars$gear)
# x freq
# 1 3 15
# 2 4 12
# 3 5 5
plyr::count(mtcars$gear[1:10])
# x freq
# 1 3 4
# 2 4 6
merge(
plyr::count(mtcars$gear),
plyr::count(mtcars$gear[1:10]),
by="x")
# x freq.x freq.y
# 1 3 15 4
# 2 4 12 6
Just use table:
tbl1 <- table(V1[V1 %in% (int <- intersect(unique(V1), unique(V2)))])
tbl2 <- table(V2[V2 %in% int])
data.frame(x = names(tbl1), freq.V1 = as.vector(tbl1), freq.V2 = as.vector(tbl2))
Or my favorite, data.table:
library(data.table)
DT <- data.table(V1 = V1, V2 = V2)
DT[V1 %in% unique(V2), .(freq.V1 = .N), by = .(x = V1)
][DT[V2 %in% unique(V1), .N, by = .(x = V2)],
freq.V2 := i.N, on = "x", nomatch = 0L]
Of course both options look much simpler if you know beforehand that V1 and V2 consist of the same set of elements:
data.frame(x = names(tbl1 <- table(V1)), freq.V1 = as.vector(tbl1),
freq.V2 = as.vector(table(V2)))
and
DT[ , .(freq.V1 = .N), by = .(x = V1)
][DT[ , .(freq.V2 = .N), by = .(x = V2)], on = "x"]
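If you would rather stay in the tidyverse, count() plus inner_join() mirrors the merge() answer (a sketch; the name = argument of count() needs dplyr 0.8 or later):
library(dplyr)
inner_join(count(tibble(x = V1), x, name = "freq.V1"),
           count(tibble(x = V2), x, name = "freq.V2"),
           by = "x")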