Dummies with quantile() in R

I am trying to assign a dummy to a data entry depending on its quantile. I have 3 terciles (1/3, 2/3, 3/3): if Leverage is in q1, a 1 should go in a separate column; if it is in q2, a 1 goes in another column (the other columns stay 0).
This is my data sample:
k <- c("gvkey1" , "gvkey1" , "gvkey1" , "gvkey1", "gvkey2", "gvkey2", "gvkey2", "gvkey2", "gvkey2", "gvkey3", "gvkey3", "gvkey1" , "gvkey1" , "gvkey1" , "gvkey1", "gvkey2", "gvkey2", "gvkey2", "gvkey2", "gvkey2", "gvkey3", "gvkey3", "gvkey1" , "gvkey1" , "gvkey1" , "gvkey1", "gvkey2", "gvkey2", "gvkey2", "gvkey2", "gvkey2", "gvkey3", "gvkey3", "gvkey1" , "gvkey1" , "gvkey1" , "gvkey1", "gvkey2", "gvkey2", "gvkey2", "gvkey2", "gvkey2", "gvkey3", "gvkey3")
l <- c("12/1/2000", "12/1/2000", "12/3/2000", "12/4/2000" , "12/5/2000" , "12/6/2000" , "12/7/2000" , "12/8/2000" , "12/9/2000" , "12/10/2000" , "12/11/2000", "12/1/2000", "12/1/2000", "12/3/2000", "12/4/2000" , "12/5/2000" , "12/6/2000" , "12/7/2000" , "12/8/2000" , "12/9/2000" , "12/10/2000" , "12/11/2000", "12/1/2000", "12/1/2000", "12/3/2000", "12/4/2000" , "12/5/2000" , "12/6/2000" , "12/7/2000" , "12/8/2000" , "12/9/2000" , "12/10/2000" , "12/11/2000", "12/1/2000", "12/1/2000", "12/3/2000", "12/4/2000" , "12/5/2000" , "12/6/2000" , "12/7/2000" , "12/8/2000" , "12/9/2000" , "12/10/2000" , "12/11/2000", "12/1/2000", "12/1/2000", "12/3/2000", "12/4/2000" , "12/5/2000" , "12/6/2000" , "12/7/2000" , "12/8/2000" , "12/9/2000" , "12/10/2000" , "12/11/2000", "12/1/2000", "12/1/2000", "12/3/2000", "12/4/2000" , "12/5/2000" , "12/6/2000" , "12/7/2000" , "12/8/2000" , "12/9/2000" , "12/10/2000" , "12/11/2000")
m <- c(1:66)
y <- structure(list(a = l, b = k, c = m), .Names = c("Date", "gvkey" , "Leverage"),
row.names = c(NA, -66L), class = "data.frame")
y$Date <- as.Date(y$Date, format = "%m/%d/%Y")
library(data.table)
test <- data.table(y)
And this is the code that is supposed to do what I described above:
# quantile function per date
d1 <- paste("d1") # first breakpoint
test <- test[, (d1) := quantile(Leverage, (1/3)), by = "Date"]
d2 <- paste("d2") #second breakpoint
test <- test[, (d2) := quantile(Leverage, (2/3)), by = "Date"]
# match companies and quantiles
dquant1 <- paste("dquant1")
test <- test[, (dquant1) := ifelse(d1 < quantile(test$Leverage, 1/3), 1, 0), by = "Date"]
dquant2 <- paste("dquant2")
test <- test[, (dquant2) := ifelse((d1 > quantile(test$Leverage, 1/3) && (d2 < quantile(test$Leverage, 2/3))),1,0), by = "Date"]
dquant3 <- paste("dquant3")
test <- test[, (dquant3) := ifelse(d1 > quantile(test$Leverage, 2/3), 1, 0), by = "Date"]
The problem in my original data set is that I sometimes get a dummy in 2 portfolios / 2 columns (e.g. 1 0 1), and that is what I want to solve. For this sample, some rows get no dummy at all.
Any suggestions welcome!
Thanks
Johannes

How about this approach?
library(dplyr)

test %>%
  rowwise() %>%
  mutate(dquant = cut(Leverage,
                      breaks = c(0, d1, d2, max(Leverage)),
                      labels = c('100', '010', '001'))) %>%
  print(n = Inf)
# A tibble: 66 x 6
Date gvkey Leverage d1 d2 dquant
<date> <chr> <int> <dbl> <dbl> <fct>
1 2000-12-01 gvkey1 1 19.7 38.3 100
2 2000-12-01 gvkey1 2 19.7 38.3 100
3 2000-12-03 gvkey1 3 21.3 39.7 100
4 2000-12-04 gvkey1 4 22.3 40.7 100
5 2000-12-05 gvkey2 5 23.3 41.7 100
6 2000-12-06 gvkey2 6 24.3 42.7 100
7 2000-12-07 gvkey2 7 25.3 43.7 100
8 2000-12-08 gvkey2 8 26.3 44.7 100
9 2000-12-09 gvkey2 9 27.3 45.7 100
10 2000-12-10 gvkey3 10 28.3 46.7 100
11 2000-12-11 gvkey3 11 29.3 47.7 100
12 2000-12-01 gvkey1 12 19.7 38.3 100
13 2000-12-01 gvkey1 13 19.7 38.3 100
14 2000-12-03 gvkey1 14 21.3 39.7 100
15 2000-12-04 gvkey1 15 22.3 40.7 100
16 2000-12-05 gvkey2 16 23.3 41.7 100
17 2000-12-06 gvkey2 17 24.3 42.7 100
18 2000-12-07 gvkey2 18 25.3 43.7 100
19 2000-12-08 gvkey2 19 26.3 44.7 100
20 2000-12-09 gvkey2 20 27.3 45.7 100
21 2000-12-10 gvkey3 21 28.3 46.7 100
22 2000-12-11 gvkey3 22 29.3 47.7 100
23 2000-12-01 gvkey1 23 19.7 38.3 010
24 2000-12-01 gvkey1 24 19.7 38.3 010
25 2000-12-03 gvkey1 25 21.3 39.7 010
26 2000-12-04 gvkey1 26 22.3 40.7 010
27 2000-12-05 gvkey2 27 23.3 41.7 010
28 2000-12-06 gvkey2 28 24.3 42.7 010
29 2000-12-07 gvkey2 29 25.3 43.7 010
30 2000-12-08 gvkey2 30 26.3 44.7 010
31 2000-12-09 gvkey2 31 27.3 45.7 010
32 2000-12-10 gvkey3 32 28.3 46.7 010
33 2000-12-11 gvkey3 33 29.3 47.7 010
34 2000-12-01 gvkey1 34 19.7 38.3 010
35 2000-12-01 gvkey1 35 19.7 38.3 010
36 2000-12-03 gvkey1 36 21.3 39.7 010
37 2000-12-04 gvkey1 37 22.3 40.7 010
38 2000-12-05 gvkey2 38 23.3 41.7 010
39 2000-12-06 gvkey2 39 24.3 42.7 010
40 2000-12-07 gvkey2 40 25.3 43.7 010
41 2000-12-08 gvkey2 41 26.3 44.7 010
42 2000-12-09 gvkey2 42 27.3 45.7 010
43 2000-12-10 gvkey3 43 28.3 46.7 010
44 2000-12-11 gvkey3 44 29.3 47.7 010
45 2000-12-01 NA 45 19.7 38.3 001
46 2000-12-01 NA 46 19.7 38.3 001
47 2000-12-03 NA 47 21.3 39.7 001
48 2000-12-04 NA 48 22.3 40.7 001
49 2000-12-05 NA 49 23.3 41.7 001
50 2000-12-06 NA 50 24.3 42.7 001
51 2000-12-07 NA 51 25.3 43.7 001
52 2000-12-08 NA 52 26.3 44.7 001
53 2000-12-09 NA 53 27.3 45.7 001
54 2000-12-10 NA 54 28.3 46.7 001
55 2000-12-11 NA 55 29.3 47.7 001
56 2000-12-01 NA 56 19.7 38.3 001
57 2000-12-01 NA 57 19.7 38.3 001
58 2000-12-03 NA 58 21.3 39.7 001
59 2000-12-04 NA 59 22.3 40.7 001
60 2000-12-05 NA 60 23.3 41.7 001
61 2000-12-06 NA 61 24.3 42.7 001
62 2000-12-07 NA 62 25.3 43.7 001
63 2000-12-08 NA 63 26.3 44.7 001
64 2000-12-09 NA 64 27.3 45.7 001
65 2000-12-10 NA 65 28.3 46.7 001
66 2000-12-11 NA 66 29.3 47.7 001
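One caveat: inside rowwise(), max(Leverage) evaluates to the current row's own Leverage, so the top break always equals the value being cut. That happens to work because cut() includes the right endpoint by default, but unbounded breaks are safer, especially if Leverage could ever be non-positive (the lower break of 0 would then produce NA). A minimal variant under that assumption:
test %>%
  rowwise() %>%
  mutate(dquant = cut(Leverage,
                      breaks = c(-Inf, d1, d2, Inf),
                      labels = c('100', '010', '001')))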
A trickier solution is shown below:
d1 <- paste("d1") # first breakpoint
test <- test[, (d1) := quantile(Leverage, (1/3)), by = "Date"]
d2 <- paste("d2") #second breakpoint
test <- test[, (d2) := quantile(Leverage, (2/3)), by = "Date"]
## I will use the '|' operator in dquant
test = test %>% rowwise() %>%
mutate(s = cut(Leverage,
breaks = c(0,d1,d2,max(Leverage)),
labels = c('1|0|0','0|1|0','0|0|1')))
> test
# A tibble: 66 x 6
Date gvkey Leverage d1 d2 dquant
<date> <chr> <int> <dbl> <dbl> <fct>
1 2000-12-01 gvkey1 1 19.7 38.3 1|0|0
2 2000-12-01 gvkey1 2 19.7 38.3 1|0|0
After this, we have to split the dquant column into multiple columns.
dummy <- data.frame(do.call('rbind',
                            strsplit(as.character(test$dquant), '|', fixed = TRUE)))
> dummy
X1 X2 X3
1 1 0 0
2 1 0 0
3 1 0 0
4 1 0 0
5 1 0 0
6 1 0 0
....
Finally, you get the answer as below:
test <- cbind(test, dummy)
> test
Date gvkey Leverage d1 d2 dquant X1 X2 X3
1 2000-12-01 gvkey1 1 19.66667 38.33333 1|0|0 1 0 0
2 2000-12-01 gvkey1 2 19.66667 38.33333 1|0|0 1 0 0
3 2000-12-03 gvkey1 3 21.33333 39.66667 1|0|0 1 0 0
4 2000-12-04 gvkey1 4 22.33333 40.66667 1|0|0 1 0 0
5 2000-12-05 gvkey2 5 23.33333 41.66667 1|0|0 1 0 0
6 2000-12-06 gvkey2 6 24.33333 42.66667 1|0|0 1 0 0
7 2000-12-07 gvkey2 7 25.33333 43.66667 1|0|0 1 0 0
8 2000-12-08 gvkey2 8 26.33333 44.66667 1|0|0 1 0 0
9 2000-12-09 gvkey2 9 27.33333 45.66667 1|0|0 1 0 0
10 2000-12-10 gvkey3 10 28.33333 46.66667 1|0|0 1 0 0
11 2000-12-11 gvkey3 11 29.33333 47.66667 1|0|0 1 0 0
...
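For completeness, the three 0/1 columns can also be built directly in data.table (staying in the question's framework, and assuming test is still the data.table built in the question), so that every row gets exactly one dummy and no string splitting (or tidyr::separate) is needed. A minimal sketch; tie handling at the breakpoints differs slightly from cut():
library(data.table)
# findInterval() counts how many breakpoints each value has passed (0, 1 or 2);
# +1 turns that into the tercile number 1..3, computed per Date
test[, grp := findInterval(Leverage, quantile(Leverage, c(1/3, 2/3))) + 1L, by = Date]
# one comparison per dummy column
test[, `:=`(dquant1 = as.integer(grp == 1),
            dquant2 = as.integer(grp == 2),
            dquant3 = as.integer(grp == 3))]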

Converting and minimizing a data.frame in an efficient way

What is the fastest way to convert my data into my desired output?
m="
n min max median q1 q3 group m SD
1 30 30 55 44.5 43.25 49.75 treat 45.38524 5.593169
2 30 31 55 47.0 44.00 49.00 treat 46.11951 4.886821
3 30 40 55 48.0 45.00 51.00 treat 47.92676 4.173242
4 15 30 51 44.0 42.50 45.50 treat 43.21604 4.245150
5 15 31 54 46.0 42.50 48.50 treat 44.94723 5.759449
6 15 44 55 48.0 45.00 48.50 treat 47.66393 3.012334
7 15 39 55 49.0 44.00 52.00 treat 48.01439 5.571240
8 15 41 55 48.0 44.50 50.00 treat 47.59677 4.261415
9 15 40 55 47.0 45.00 50.00 treat 47.38081 4.200670
10 18 42 55 46.0 44.00 49.50 cont 46.91764 3.996259
11 18 40 55 44.0 43.00 47.00 cont 45.25704 3.667377
12 18 41 55 44.5 44.00 50.00 cont 46.58674 4.334604
13 9 42 49 46.0 43.00 48.00 cont 45.60879 3.357931
14 9 42 48 44.0 43.00 45.00 cont 44.29745 1.878592
15 9 41 55 44.0 44.00 45.00 cont 45.43229 2.779801
16 9 43 55 50.0 44.00 52.00 cont 48.73261 5.506545
17 9 43 55 46.0 44.00 51.00 cont 47.61981 5.069204
18 9 41 55 50.0 44.00 51.00 cont 48.19267 5.403842"
data <- read.table(text=m, h=T)
Desired_Output="
nT mT sdT nC mC sdC
30 45.38524 5.593169 18 46.91764 3.996259
30 46.11951 4.886821 18 45.25704 3.667377
30 47.92676 4.173242 18 46.58674 4.334604
. . . . . .
. . . . . .
. . . . . .
15 47.38081 4.200670 9 48.19267 5.403842"
Here is a base R solution,
l2 <- split(data[c('n', 'm', 'SD')], data$group)
do.call(cbind, Map(function(x, y){names(x) <- paste0(names(x), '_', y); x}, l2, names(l2)))
# cont.n_cont cont.m_cont cont.SD_cont treat.n_treat treat.m_treat treat.SD_treat
#10 18 46.91764 3.996259 30 45.38524 5.593169
#11 18 45.25704 3.667377 30 46.11951 4.886821
#12 18 46.58674 4.334604 30 47.92676 4.173242
#13 9 45.60879 3.357931 15 43.21604 4.245150
#14 9 44.29745 1.878592 15 44.94723 5.759449
#15 9 45.43229 2.779801 15 47.66393 3.012334
#16 9 48.73261 5.506545 15 48.01439 5.571240
#17 9 47.61981 5.069204 15 47.59677 4.261415
#18 9 48.19267 5.403842 15 47.38081 4.200670
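A small tweak, in case the doubled names (cont.n_cont and so on) bother you: cbind() on data.frames prefixes the columns with the list element names, so wrapping the Map() call in unname() leaves just the n_cont, m_cont, ... headers:
do.call(cbind, unname(Map(function(x, y){names(x) <- paste0(names(x), '_', y); x}, l2, names(l2))))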
It's a little messy, but you may try using dplyr:
library(dplyr)

data <- data %>%
  group_by(group) %>%
  mutate(idx = 1:n()) %>%
  select(group, n, idx, m, SD) %>%
  ungroup()

df1 <- data %>%
  filter(group == "treat") %>%
  select(-group)

df2 <- data %>%
  filter(group == "cont") %>%
  select(-group)

df <- df1 %>%
  left_join(df2, by = "idx") %>%
  select(-idx)

names(df) <- c("nT", "mT", "sdT", "nC", "mC", "sdC")
df
nT mT sdT nC mC sdC
<int> <dbl> <dbl> <int> <dbl> <dbl>
1 30 45.4 5.59 18 46.9 4.00
2 30 46.1 4.89 18 45.3 3.67
3 30 47.9 4.17 18 46.6 4.33
4 15 43.2 4.25 9 45.6 3.36
5 15 44.9 5.76 9 44.3 1.88
6 15 47.7 3.01 9 45.4 2.78
7 15 48.0 5.57 9 48.7 5.51
8 15 47.6 4.26 9 47.6 5.07
9 15 47.4 4.20 9 48.2 5.40
Or using tidyr::pivot_wider,
library(tidyr)

data %>%
  group_by(group) %>%
  mutate(idx = 1:n()) %>%
  select(group, n, idx, m, SD) %>%
  ungroup() %>%
  pivot_wider(id_cols = idx, values_from = c(n, m, SD), names_from = group) %>%
  select(-idx)
n_treat n_cont m_treat m_cont SD_treat SD_cont
<int> <int> <dbl> <dbl> <dbl> <dbl>
1 30 18 45.4 46.9 5.59 4.00
2 30 18 46.1 45.3 4.89 3.67
3 30 18 47.9 46.6 4.17 4.33
4 15 9 43.2 45.6 4.25 3.36
5 15 9 44.9 44.3 5.76 1.88
6 15 9 47.7 45.4 3.01 2.78
7 15 9 48.0 48.7 5.57 5.51
8 15 9 47.6 47.6 4.26 5.07
9 15 9 47.4 48.2 4.20 5.40
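To get the exact headers and column order of Desired_Output, a renaming select() can replace the select(-idx) step; a sketch using the names pivot_wider produces above:
library(dplyr)
library(tidyr)

data %>%
  group_by(group) %>%
  mutate(idx = 1:n()) %>%
  ungroup() %>%
  pivot_wider(id_cols = idx, values_from = c(n, m, SD), names_from = group) %>%
  select(nT = n_treat, mT = m_treat, sdT = SD_treat,
         nC = n_cont, mC = m_cont, sdC = SD_cont)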
New answer, a dplyr-only solution:
library(dplyr)

df %>%
  select(n, m, SD, group) %>%
  slice(10:18) %>%
  rename_with(~ paste0(.x, "C")) %>%
  bind_cols(df[1:9, ]) %>%
  rename_with(~ paste0(.x, "T"), c(n, m, SD)) %>%
  select(nT, mT, sdT = SDT, nC, mC, sdC = SDC)
nT mT sdT nC mC sdC
10 30 45.38524 5.593169 18 46.91764 3.996259
11 30 46.11951 4.886821 18 45.25704 3.667377
12 30 47.92676 4.173242 18 46.58674 4.334604
13 15 43.21604 4.245150 9 45.60879 3.357931
14 15 44.94723 5.759449 9 44.29745 1.878592
15 15 47.66393 3.012334 9 45.43229 2.779801
16 15 48.01439 5.571240 9 48.73261 5.506545
17 15 47.59677 4.261415 9 47.61981 5.069204
18 15 47.38081 4.200670 9 48.19267 5.403842

Apply a function to a set of columns in a dataset

Using this function, I calculate the variance of some 3D points.
centroid_3d_sq_dist <- function(point_matrix) {
  if (nrow(point_matrix) == 1) {
    return(0)
  }
  mean_point <- apply(point_matrix, 2, mean)
  point_sq_distances <- apply(
    point_matrix,
    1,
    function(row_point) {
      sum((row_point - mean_point)^2)
    }
  )
  sum_sq_distances <- sum(point_sq_distances)
  return(sum_sq_distances)
}

point_3d_variance <- function(point_matrix) {
  if (nrow(point_matrix) == 1) {
    return(0)
  }
  dist_var <- centroid_3d_sq_dist(point_matrix) /
    (nrow(point_matrix) - 1)
  return(dist_var)
}
The argument of this function is a matrix of (x, y, z) coordinates.
Now I have a dataset with two 3D points per row:
ID Trial Size PP PA FkA ciccioX ciccioY ciccioZ pinoX pinoY pinoZ
1 Gigi 1 40 39.6 1050. 31.5 521. 293. 10.6 516. 323. 6.41
2 Gigi 2 20.0 30.7 944. 9.35 525. 300. 12.6 520. 305. 7.09
3 Gigi 3 30 29.5 1056. 24.1 521. 298. 12.3 519. 321. 5.89
4 Gigi 5 60 53.0 1190. 53.0 680. 287. 64.4 699. 336. 68.6
5 Bibi 1 40 38.3 1038. 31.4 524. 289. 10.9 519. 319. 6.17
6 Bibi 2 60 64.7 1293. 47.8 516. 282. 10.4 519. 330. 6.32
7 Bibi 3 20.0 33.8 1092. 17.5 523. 300. 12.8 518. 315. 6.22
8 Bibi 4 30 35.0 1108. 26.4 525. 295. 11.7 517. 320. 5.78
9 Bibi 5 50 46.5 1199. 34.2 515. 289. 11.2 517. 323. 6.27
10 Bibi 6 30 28.7 1016. 17.1 528. 298. 12.7 524. 314. 6.36
The 3D points are:
ciccio: ciccioX ciccioY ciccioZ
pino: pinoX pinoY pinoZ
I want to calculate the variance of ciccio and the variance of pino, grouped by ID and Size.
I tried to do:
data %>%
  group_by(SubjectID, Size) %>%
  summarize(as.data.frame(matrix(f4(dd[7:9], dd[10:12]), nr = 1)))
But it doesn't work.
Do you have any advice?
Your sample dataset is too small to calculate (meaningful) variances, but you could use:
library(dplyr)

df %>%
  group_by(ID, Size) %>%
  summarise(var_ciccio = point_3d_variance(as.matrix(across(ciccioX:ciccioZ))),
            var_pino = point_3d_variance(as.matrix(across(pinoX:pinoZ))),
            .groups = "drop")
This returns
# A tibble: 9 x 4
ID Size var_ciccio var_pino
<chr> <dbl> <dbl> <dbl>
1 Bibi 20 0 0
2 Bibi 30 9.5 42.7
3 Bibi 40 0 0
4 Bibi 50 0 0
5 Bibi 60 0 0
6 Gigi 20 0 0
7 Gigi 30 0 0
8 Gigi 40 0 0
9 Gigi 60 0 0
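As an aside, point_3d_variance() is the trace of the 3x3 covariance matrix, i.e. the per-coordinate variances summed, so a drop-in equivalent (a sketch, keeping the same nrow == 1 guard) is shorter:
point_3d_variance <- function(point_matrix) {
  if (nrow(point_matrix) == 1) return(0)
  # sum of squared distances to the centroid / (n - 1) == sum of coordinate variances
  sum(apply(point_matrix, 2, var))
}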

Add new column to state data frame based on other column data [duplicate]

This question already has answers here:
Categorize numeric variable into group/ bins/ breaks
(4 answers)
Closed 1 year ago.
I am attempting to add a new column to the state sample data frame in R. I am hoping for this column to cluster the IDs of the states into broader categories (1-4). My code is close to what I am looking for, but I am not getting it quite right. I know I could enter each state ID line by line, but is there a quicker way? Thank you!
library(tidyverse)

# Add column to denote each state
States <- state.x77
States <- data.frame(States)
States <- tibble::rowid_to_column(States, "ID")
States

# Create new variable for state buckets
States <- States %>%
  mutate(WAGE_BUCKET = case_when(ID <= c(1, 12) ~ '1',
                                 ID <= c(13, 24) ~ '2',
                                 ID <= c(25, 37) ~ '3',
                                 ID <= c(38, 50) ~ '4',
                                 TRUE ~ 'NA'))
View(States) # It is not grouping the states the way I want / I am still getting some NA values but unsure why!
You can use cut or findInterval if all of your groups will be using contiguous ID values:
findInterval(States$ID, c(0, 12, 24, 37, 51))
# [1] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4
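The cut() version of the same bucketing would be along these lines (cut() is right-closed by default, so (0,12] -> 1, (12,24] -> 2, and so on):
cut(States$ID, breaks = c(0, 12, 24, 37, 50), labels = 1:4)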
If you want to make it a bit more verbose, you can use dplyr::between in your case_when:
States %>%
  mutate(
    WAGE_BUCKET = case_when(
      between(ID, 1, 12) ~ "1",
      between(ID, 13, 24) ~ "2",
      between(ID, 25, 37) ~ "3",
      between(ID, 38, 50) ~ "4",
      TRUE ~ NA_character_)
  )
# ID Population Income Illiteracy Life Exp Murder HS Grad Frost Area WAGE_BUCKET
# 1 1 3615 3624 2.1 69.05 15.1 41.3 20 50708 1
# 2 2 365 6315 1.5 69.31 11.3 66.7 152 566432 1
# 3 3 2212 4530 1.8 70.55 7.8 58.1 15 113417 1
# 4 4 2110 3378 1.9 70.66 10.1 39.9 65 51945 1
# 5 5 21198 5114 1.1 71.71 10.3 62.6 20 156361 1
# 6 6 2541 4884 0.7 72.06 6.8 63.9 166 103766 1
# 7 7 3100 5348 1.1 72.48 3.1 56.0 139 4862 1
# 8 8 579 4809 0.9 70.06 6.2 54.6 103 1982 1
# 9 9 8277 4815 1.3 70.66 10.7 52.6 11 54090 1
# 10 10 4931 4091 2.0 68.54 13.9 40.6 60 58073 1
# 11 11 868 4963 1.9 73.60 6.2 61.9 0 6425 1
# 12 12 813 4119 0.6 71.87 5.3 59.5 126 82677 1
# 13 13 11197 5107 0.9 70.14 10.3 52.6 127 55748 2
# 14 14 5313 4458 0.7 70.88 7.1 52.9 122 36097 2
# 15 15 2861 4628 0.5 72.56 2.3 59.0 140 55941 2
# 16 16 2280 4669 0.6 72.58 4.5 59.9 114 81787 2
# 17 17 3387 3712 1.6 70.10 10.6 38.5 95 39650 2
# 18 18 3806 3545 2.8 68.76 13.2 42.2 12 44930 2
# 19 19 1058 3694 0.7 70.39 2.7 54.7 161 30920 2
# 20 20 4122 5299 0.9 70.22 8.5 52.3 101 9891 2
# 21 21 5814 4755 1.1 71.83 3.3 58.5 103 7826 2
# 22 22 9111 4751 0.9 70.63 11.1 52.8 125 56817 2
# 23 23 3921 4675 0.6 72.96 2.3 57.6 160 79289 2
# 24 24 2341 3098 2.4 68.09 12.5 41.0 50 47296 2
# 25 25 4767 4254 0.8 70.69 9.3 48.8 108 68995 3
# 26 26 746 4347 0.6 70.56 5.0 59.2 155 145587 3
# 27 27 1544 4508 0.6 72.60 2.9 59.3 139 76483 3
# 28 28 590 5149 0.5 69.03 11.5 65.2 188 109889 3
# 29 29 812 4281 0.7 71.23 3.3 57.6 174 9027 3
# 30 30 7333 5237 1.1 70.93 5.2 52.5 115 7521 3
# 31 31 1144 3601 2.2 70.32 9.7 55.2 120 121412 3
# 32 32 18076 4903 1.4 70.55 10.9 52.7 82 47831 3
# 33 33 5441 3875 1.8 69.21 11.1 38.5 80 48798 3
# 34 34 637 5087 0.8 72.78 1.4 50.3 186 69273 3
# 35 35 10735 4561 0.8 70.82 7.4 53.2 124 40975 3
# 36 36 2715 3983 1.1 71.42 6.4 51.6 82 68782 3
# 37 37 2284 4660 0.6 72.13 4.2 60.0 44 96184 3
# 38 38 11860 4449 1.0 70.43 6.1 50.2 126 44966 4
# 39 39 931 4558 1.3 71.90 2.4 46.4 127 1049 4
# 40 40 2816 3635 2.3 67.96 11.6 37.8 65 30225 4
# 41 41 681 4167 0.5 72.08 1.7 53.3 172 75955 4
# 42 42 4173 3821 1.7 70.11 11.0 41.8 70 41328 4
# 43 43 12237 4188 2.2 70.90 12.2 47.4 35 262134 4
# 44 44 1203 4022 0.6 72.90 4.5 67.3 137 82096 4
# 45 45 472 3907 0.6 71.64 5.5 57.1 168 9267 4
# 46 46 4981 4701 1.4 70.08 9.5 47.8 85 39780 4
# 47 47 3559 4864 0.6 71.72 4.3 63.5 32 66570 4
# 48 48 1799 3617 1.4 69.48 6.7 41.6 100 24070 4
# 49 49 4589 4468 0.7 72.48 3.0 54.5 149 54464 4
# 50 50 376 4566 0.6 70.29 6.9 62.9 173 97203 4
c(1, 12) is a vector of length > 1, so the comparison recycles it across ID instead of testing a range (the comparison operators work elementwise on vectors). We could use between:
library(dplyr)

States <- States %>%
  mutate(WAGE_BUCKET = case_when(between(ID, 1, 12) ~ '1',
                                 between(ID, 13, 24) ~ '2',
                                 between(ID, 25, 37) ~ '3',
                                 between(ID, 38, 50) ~ '4',
                                 TRUE ~ NA_character_))
Or another option is to use & with >= and <=:
States %>%
  mutate(WAGE_BUCKET = case_when(ID >= 1 & ID <= 12 ~ '1',
                                 ID >= 13 & ID <= 24 ~ '2',
                                 ID >= 25 & ID <= 37 ~ '3',
                                 ID >= 38 & ID <= 50 ~ '4',
                                 TRUE ~ NA_character_))
Or maybe the OP meant to use %in%:
States %>%
  mutate(WAGE_BUCKET = case_when(ID %in% c(1, 12) ~ '1',
                                 ID %in% c(13, 24) ~ '2',
                                 ID %in% c(25, 37) ~ '3',
                                 ID %in% c(38, 50) ~ '4',
                                 TRUE ~ NA_character_))
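Note that ID %in% c(1, 12) tests membership in exactly the two values 1 and 12; if contiguous ranges were intended here, the vectors would need expanding, e.g. ID %in% 1:12.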

Calculate differences between a big number of variables

I am trying to calculate differences between different columns. I did it with a loop, but I know that is not an elegant solution and not efficient in R; my results also contain duplicated pairs and pointless operations (disp - disp, or both hp_disp and disp_hp).
My real data have NAs, which I tried to simulate here. My goal is to improve my code while getting the same table below.
An example of my command is:
names(mtcars)
mtcars$mpg[mtcars$am == 1] <- NA
vars1 <- c("mpg", "cyl", "disp", "hp")
vars2 <- c("mpg", "cyl", "disp", "hp")
df <- data.frame()
df_all <- data.frame()
df_all <- length(mtcars)
for (i in vars1) {
  for (k in vars2) {
    df <- mtcars[[i]] - mtcars[[k]]
    df_all <- cbind(df_all, df)
    length <- ncol(df_all)
    colnames(df_all)[length] <- paste0(i, "_", k)
  }
}
head(df_all)
disp_mpg disp_cyl disp_disp disp_hp hp_mpg hp_cyl hp_disp hp_hp
[1,] NA 154 0 50 NA 104 -50 0
[2,] NA 154 0 50 NA 104 -50 0
[3,] NA 104 0 15 NA 89 -15 0
[4,] 236.6 252 0 148 88.6 104 -148 0
[5,] 341.3 352 0 185 156.3 167 -185 0
[6,] 206.9 219 0 120 86.9 99 -120 0
Here's one way to do that, using the data.table library:
library(data.table)

vars <- c("mpg", "cyl", "disp", "hp")

# create table of pairs to diff
to_diff <- CJ(vars, vars)[V1 < V2]

# calculate diffs
diffs <- to_diff[, .(diff_val = mtcars[, V1] - mtcars[, V2]),
                 by = .(cols = paste0(V1, '_minus_', V2))]

# number each row in each "cols" group
diffs[, rid := rowid(cols)]

# transform so that rid determines the row, cols determines the col,
# and the values are the value of diff_val
dcast(diffs, rid ~ cols, value.var = 'diff_val')
Output
#
# rid cyl_minus_disp cyl_minus_hp cyl_minus_mpg disp_minus_hp disp_minus_mpg hp_minus_mpg
# 1: 1 -154.0 -104 -15.0 50.0 139.0 89.0
# 2: 2 -154.0 -104 -15.0 50.0 139.0 89.0
# 3: 3 -104.0 -89 -18.8 15.0 85.2 70.2
# 4: 4 -252.0 -104 -15.4 148.0 236.6 88.6
# 5: 5 -352.0 -167 -10.7 185.0 341.3 156.3
# 6: 6 -219.0 -99 -12.1 120.0 206.9 86.9
# 7: 7 -352.0 -237 -6.3 115.0 345.7 230.7
# 8: 8 -142.7 -58 -20.4 84.7 122.3 37.6
# 9: 9 -136.8 -91 -18.8 45.8 118.0 72.2
# 10: 10 -161.6 -117 -13.2 44.6 148.4 103.8
# 11: 11 -161.6 -117 -11.8 44.6 149.8 105.2
# 12: 12 -267.8 -172 -8.4 95.8 259.4 163.6
# 13: 13 -267.8 -172 -9.3 95.8 258.5 162.7
# 14: 14 -267.8 -172 -7.2 95.8 260.6 164.8
# 15: 15 -464.0 -197 -2.4 267.0 461.6 194.6
# 16: 16 -452.0 -207 -2.4 245.0 449.6 204.6
# 17: 17 -432.0 -222 -6.7 210.0 425.3 215.3
# 18: 18 -74.7 -62 -28.4 12.7 46.3 33.6
# 19: 19 -71.7 -48 -26.4 23.7 45.3 21.6
# 20: 20 -67.1 -61 -29.9 6.1 37.2 31.1
# 21: 21 -116.1 -93 -17.5 23.1 98.6 75.5
# 22: 22 -310.0 -142 -7.5 168.0 302.5 134.5
# 23: 23 -296.0 -142 -7.2 154.0 288.8 134.8
# 24: 24 -342.0 -237 -5.3 105.0 336.7 231.7
# 25: 25 -392.0 -167 -11.2 225.0 380.8 155.8
# 26: 26 -75.0 -62 -23.3 13.0 51.7 38.7
# 27: 27 -116.3 -87 -22.0 29.3 94.3 65.0
# 28: 28 -91.1 -109 -26.4 -17.9 64.7 82.6
# 29: 29 -343.0 -256 -7.8 87.0 335.2 248.2
# 30: 30 -139.0 -169 -13.7 -30.0 125.3 155.3
# 31: 31 -293.0 -327 -7.0 -34.0 286.0 320.0
# 32: 32 -117.0 -105 -17.4 12.0 99.6 87.6
# rid cyl_minus_disp cyl_minus_hp cyl_minus_mpg disp_minus_hp disp_minus_mpg hp_minus_mpg
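A base R alternative that also skips the self- and duplicate pairs is combn(), which visits each unordered pair of columns exactly once. A minimal sketch:
vars <- c("mpg", "cyl", "disp", "hp")
# one column per unordered pair; combn() simplifies the results to a matrix
diffs <- combn(vars, 2, function(p) mtcars[[p[1]]] - mtcars[[p[2]]])
colnames(diffs) <- combn(vars, 2, paste, collapse = "_minus_")
head(diffs)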

How to index dataframe column inside a function in R

I have a function that takes a dataframe, a percentile threshold, and the name of a given column, and records in a new column which values of the given column are above the threshold (0 for <, 1 for >=). However, it won't let me use df$column_name inside the quantile function, because column_name is not actually a column name but a variable storing the column name, so df$column_name returns NULL. Is there any way to work around this while keeping the code roughly as it is? Or do I have to pass the numerical column index instead of the name? I can do that, but it is not as convenient or readable as passing the column name.
func1 <- function(df, threshold, column_name) {
  threshold_value <- quantile(df$column_name, c(threshold))
  new_df <- df %>%
    mutate(ifelse(column_name > threshold_value, 1, 0))
  return(new_df)
}
Thank you so much for your help!
I modified your function as follows. Now the function takes a data frame, a threshold, and a column name, and it only needs base R.
# Modified function
func1 <- function(df, threshold, column_name) {
  threshold_value <- quantile(df[[column_name]], threshold)
  new_df <- df
  new_df[["new_col"]] <- ifelse(df[[column_name]] > threshold_value, 1, 0)
  return(new_df)
}
# Take the trees data frame as an example
head(trees)
# Girth Height Volume
# 1 8.3 70 10.3
# 2 8.6 65 10.3
# 3 8.8 63 10.2
# 4 10.5 72 16.4
# 5 10.7 81 18.8
# 6 10.8 83 19.7
# Apply the function
func1(trees, 0.5, "Volume")
# Girth Height Volume new_col
# 1 8.3 70 10.3 0
# 2 8.6 65 10.3 0
# 3 8.8 63 10.2 0
# 4 10.5 72 16.4 0
# 5 10.7 81 18.8 0
# 6 10.8 83 19.7 0
# 7 11.0 66 15.6 0
# 8 11.0 75 18.2 0
# 9 11.1 80 22.6 0
# 10 11.2 75 19.9 0
# 11 11.3 79 24.2 0
# 12 11.4 76 21.0 0
# 13 11.4 76 21.4 0
# 14 11.7 69 21.3 0
# 15 12.0 75 19.1 0
# 16 12.9 74 22.2 0
# 17 12.9 85 33.8 1
# 18 13.3 86 27.4 1
# 19 13.7 71 25.7 1
# 20 13.8 64 24.9 1
# 21 14.0 78 34.5 1
# 22 14.2 80 31.7 1
# 23 14.5 74 36.3 1
# 24 16.0 72 38.3 1
# 25 16.3 77 42.6 1
# 26 17.3 81 55.4 1
# 27 17.5 82 55.7 1
# 28 17.9 80 58.3 1
# 29 18.0 80 51.5 1
# 30 18.0 80 51.0 1
# 31 20.6 87 77.0 1
If you still want to use dplyr, it is essential to learn how to deal with non-standard evaluation. Please see the programming vignette to learn more (https://cran.r-project.org/web/packages/dplyr/vignettes/programming.html). The following code also works:
library(dplyr)

func2 <- function(df, threshold, column_name) {
  col_en <- enquo(column_name)
  threshold_value <- quantile(df %>% pull(!!col_en), threshold)
  new_df <- df %>%
    mutate(new_col := ifelse(!!col_en >= threshold_value, 1, 0))
  return(new_df)
}
func2(trees, 0.5, Volume)
# Girth Height Volume new_col
# 1 8.3 70 10.3 0
# 2 8.6 65 10.3 0
# 3 8.8 63 10.2 0
# 4 10.5 72 16.4 0
# 5 10.7 81 18.8 0
# 6 10.8 83 19.7 0
# 7 11.0 66 15.6 0
# 8 11.0 75 18.2 0
# 9 11.1 80 22.6 0
# 10 11.2 75 19.9 0
# 11 11.3 79 24.2 1
# 12 11.4 76 21.0 0
# 13 11.4 76 21.4 0
# 14 11.7 69 21.3 0
# 15 12.0 75 19.1 0
# 16 12.9 74 22.2 0
# 17 12.9 85 33.8 1
# 18 13.3 86 27.4 1
# 19 13.7 71 25.7 1
# 20 13.8 64 24.9 1
# 21 14.0 78 34.5 1
# 22 14.2 80 31.7 1
# 23 14.5 74 36.3 1
# 24 16.0 72 38.3 1
# 25 16.3 77 42.6 1
# 26 17.3 81 55.4 1
# 27 17.5 82 55.7 1
# 28 17.9 80 58.3 1
# 29 18.0 80 51.5 1
# 30 18.0 80 51.0 1
# 31 20.6 87 77.0 1
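With current dplyr/rlang, the enquo()/!! pair can also be spelled with the {{ }} ("curly-curly") shorthand; a sketch of the same function (keeping func2's >= comparison):
library(dplyr)

func3 <- function(df, threshold, column_name) {
  df %>%
    # {{ column_name }} defuses the bare column name passed by the caller
    mutate(new_col = ifelse({{ column_name }} >= quantile({{ column_name }}, threshold), 1, 0))
}

func3(trees, 0.5, Volume)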