Let's say I have the following data:
# Fix the RNG seed so the sampled IDs and values are reproducible.
set.seed(123)
# Example data: ID is a shuffled vector of the integers 1:100, each repeated a
# Poisson(4)-distributed number of times; vals holds one normal draw
# (mean 10000, sd 5000) per row, rounded and made non-negative.
test <- tibble(
ID = sample(rep(1:100, rpois(100,4))),
vals = abs(round(rnorm(length(ID), 10000, 5000)))
)
I would like to sort test first by vals and then by ID with the desired output looking like this:
# A tibble: 409 x 2
ID vals
<int> <dbl>
1 48 26522
2 48 14427
3 48 7570
4 48 5922
5 92 25286
6 92 10436
7 92 5705
8 92 4036
9 92 3399
10 64 22190
# ... with 399 more rows
i.e. it should group the sorting by ID and then in decreasing order vals.
What I tried:
# Attempt 1: order by ID ascending, then by vals descending within each ID.
test %>% arrange(ID, desc(vals))
# Attempt 2: order by vals descending, using ID only to break ties.
test %>% arrange(desc(vals), ID)
# Attempt 3: sort by ID, then re-sort all rows by vals descending.
test %>% arrange(ID) %>% arrange(desc(vals))
I think your description is missing a step:
sort the IDs first by each ID's maximum value, descending; then, within each ID, sort by vals, descending.
Try this:
library(dplyr)
test %>%
  # global dense rank of every value; rank 1 is the largest vals
  mutate(valrank = dense_rank(desc(vals))) %>%
  # every row of an ID inherits the best (smallest) rank found in that ID
  group_by(ID) %>%
  mutate(valrank = min(valrank)) %>%
  ungroup() %>%
  # IDs ordered by their best rank; rows within an ID by vals, descending
  arrange(valrank, ID, desc(vals))
# # A tibble: 409 x 3
# ID vals valrank
# <int> <dbl> <int>
# 1 48 26522 1
# 2 48 14427 1
# 3 48 7570 1
# 4 48 5922 1
# 5 92 25286 2
# 6 92 10436 2
# 7 92 5705 2
# 8 92 4036 2
# 9 92 3399 2
# 10 64 22190 3
# # ... with 399 more rows
(I kept valrank just for demonstration.)
Related
I am applying a user-defined function to the numeric variables of a dataset, but when I apply it with map the first column of each result is named x instead of the variable's name. How do I replace x with the variable name in map functions?
dataset: hd_trn
age sex cp trestbps chol fbs restecg thalach exang
<int> <fctr> <fctr> <int> <int> <fctr> <fctr> <int> <fctr>
63 1 1 145 233 1 2 150 0
67 1 4 160 286 0 2 108 1
67 1 4 120 229 0 2 129 1
37 1 3 130 250 0 0 187 0
41 0 2 130 204 0 2 172 0
56 1 2 120 236 0 0 178 0
user defined function to calculate high freq elements column wise
# Frequency table of `x` as a data frame (columns `x` and `Freq`), reduced to
# the rows with the five highest counts (ties kept), most frequent first.
top_freq_elements <- function(x) {
  # the argument must stay named `x`: table() uses it as the column name
  freq_table <- as.data.frame(table(x))
  top_rows <- top_n(freq_table, 5, Freq)
  arrange(top_rows, desc(Freq))
}
Applying function
# Build the top-frequency table for every numeric column of hd_trn.
hd_trn %>%
  select_if(is.numeric) %>%
  map(top_freq_elements)
######### output #########
x Freq
<fctr> <int>
54 51
58 43
55 41
56 38
57 38
desired: In the above output I am looking to get variable name instead of x
Tried reconstructing code below using imap but that is also not giving variable name:
# For each numeric column, tabulate its values and keep the most frequent ones.
hd_trn %>%
select_if(is.numeric) %>%
# imap() passes each column's values and the column's name to the function
imap(function(feature_value, feature_name){
table(feature_value) %>%
as.data.frame() %>% #head()
# NOTE: this names the column literally "feature_name" (see the output below);
# the value held in the feature_name argument is not used here.
rename(feature_name = feature_value) %>%
top_n(5, Freq) %>%
arrange(desc(Freq))
})
######### output #########
feature_name Freq
<fctr> <int>
54 51
58 43
55 41
56 38
57 38
You can rename the first column of each data frame in the list:
library(dplyr)
library(purrr)
# For every numeric column of iris, build a frequency table whose first column
# carries the column's own name, keeping the five highest counts (ties kept).
iris %>%
  select(where(is.numeric)) %>%
  imap(function(column_values, column_name) {
    counts <- as.data.frame(table(column_values))
    counts %>%
      # rename by position, so the temporary name from table() is irrelevant
      rename_with(function(old_name) column_name, .cols = 1) %>%
      slice_max(order_by = Freq, n = 5) %>%
      arrange(desc(Freq))
  })
This could be achieved using e.g. curly-curly {{ and := in rename like so:
# Helper from the question: frequency table of x reduced to the top-5 rows by
# Freq. It is not called by the pipeline below.
top_freq_elements <- function(x){
table(x) %>% as.data.frame() %>% top_n(5, Freq) %>% arrange(desc(Freq))
}
library(dplyr)
library(purrr)
# For each numeric column, tabulate its values and name the value column after
# the column being processed.
hd_trn %>%
select_if(is.numeric) %>%
imap(function(feature_value, feature_name){
table(feature_value) %>%
as.data.frame() %>% #head()
# `{{feature_name}} :=` takes the new column name from the value of
# feature_name (the column's name), as the output below shows.
rename({{feature_name}} := feature_value) %>%
top_n(5, Freq) %>%
arrange(desc(Freq))
})
#> $age
#> age Freq
#> 1 67 2
#> 2 37 1
#> 3 41 1
#> 4 56 1
#> 5 63 1
#>
#> $sex
#> sex Freq
#> 1 1 5
#> 2 0 1
#>
#> $cp
#> cp Freq
#> 1 2 2
#> 2 4 2
#> 3 1 1
#> 4 3 1
#>
#> $trestbps
#> trestbps Freq
#> 1 120 2
#> 2 130 2
#> 3 145 1
#> 4 160 1
I am given a data set called ChickWeight. It contains the weights of chicks over a period of time. I need to introduce a new variable that measures the current weight difference compared to day 0. The data set is in library(datasets), so you should have it.
library(dplyr)
# Keep only chicks that have a measurement at Time 21, then subtract the first
# weight of each chick's group from every weight of that chick.
# NOTE(review): the grouping only takes effect with dplyr::mutate; if another
# package's mutate (e.g. plyr's) masks it, the grouping is ignored - confirm
# which mutate is in use.
weightgain <- ChickWeight %>%
group_by(Chick) %>%
filter(any(Time == 21)) %>%
mutate(weightgain = weight - first(weight))
I have this code, but this code just subtracts each weight by 42 which is the weight at time 0 for chick 1. I need each chick to be subtracted by its own weight at time 0 so that the weightgain column is correct.
We could do
library(dplyr)
# Within each chick, subtract that chick's weight at Time 0 from every weight.
ChickWeight %>%
group_by(Chick) %>%
mutate(weightgain = weight - weight[Time == 0])
# Or, using the first row of each chick instead of the Time == 0 row:
#Or mutate(weightgain = weight - first(weight))
# A tibble: 578 x 5
# Groups: Chick [50]
# weight Time Chick Diet weightgain
# <dbl> <dbl> <ord> <fct> <dbl>
# 1 42 0 1 1 0
# 2 51 2 1 1 9
# 3 59 4 1 1 17
# 4 64 6 1 1 22
# 5 76 8 1 1 34
# 6 93 10 1 1 51
# 7 106 12 1 1 64
# 8 125 14 1 1 83
# 9 149 16 1 1 107
#10 171 18 1 1 129
# … with 568 more rows
Or using base R ave
# Base R: ave() broadcasts each chick's first weight to all of its rows, which
# is then subtracted from the observed weights.
with(ChickWeight, weight - ave(weight, Chick, FUN = function(w) w[1]))
I have a data frame set up similar to this:
# Example data: two ids, each with two tasks, and two scored items (a, b).
id   <- c(123, 234, 123, 234)
task <- c(54, 23, 12, 58)
a    <- c(23, 67, 45, 89)
b    <- c(78, 45, 65, 45)
df <- data.frame(id = id, task = task, a = a, b = b)
> df
id task a b
1 123 54 23 78
2 234 23 67 45
3 123 12 45 65
4 234 58 89 45
where I score a and b for each ID:
# Row-wise mean of columns 3 and 4 (a and b), ignoring missing values.
df[["score"]] <- rowMeans(df[, 3:4], na.rm = TRUE)
> df
id task a b score
1 123 54 23 78 50.5
2 234 23 67 45 56.0
3 123 12 45 65 55.0
4 234 58 89 45 67.0
for each id I got an aggregate score like such:
# Average the per-row scores within each group. The second argument (1) picks
# the grouping variable - presumably by column position, i.e. id, as the output
# below suggests; confirm against the plyr documentation.
out <- ddply(df, 1, summarise,
overall = mean(score, na.rm = TRUE))
> out
id overall
1 123 52.75
2 234 61.50
but what I want my final output to have is a new column that has the scores that went into the overall and their task id like this:
id overall meta
1 123 52.75 "task_scores":[{"54":50.5,"12":55}]
2 234 61.50 "task_scores":[{"23":56,"58":67}]
how would I go about doing that using R?
We could make use of jsonlite to create the structure
library(jsonlite)
library(plyr)
# Per id: the mean score, plus a string made of '"task_scores":' followed by
# the JSON of a one-row data frame whose columns are named by task and hold
# the corresponding scores.
ddply(df, "id", summarise, overall = mean(score, na.rm = TRUE),
meta = paste0('"task_scores":',
toJSON(setNames(as.data.frame.list(score), task))))
# id overall meta
#1 123 52.75 "task_scores":[{"54":50.5,"12":55}]
#2 234 61.50 "task_scores":[{"23":56,"58":67}]
I don't know how to make that metadata dictionary offhand, but you could do something like this:
library(dplyr)
library(magrittr)
# Attach each id's mean score to every row of that id; all original rows are kept.
out <- df %>% group_by(id) %>% mutate(overall = mean(score))
> out
# A tibble: 4 x 6
# Groups: id [2]
id task a b score overall
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 123 54 23 78 50.5 52.8
2 234 23 67 45 56 61.5
3 123 12 45 65 55 52.8
4 234 58 89 45 67 61.5
So the df would have both the aggregated scores and preserve the data in the original rows.
You can do it with a few mutates. Paste your tallies, get your row average, then your group average.
library(dplyr)
# Row score = mean of columns 3 and 4 (a and b); then, per id, add the mean
# score and a comma-separated "task:score" tally of every row in the group.
df %>%
  mutate(score = rowMeans(.[, 3:4], na.rm = TRUE)) %>%
  group_by(id) %>%
  mutate(
    overall = mean(score),
    tally = paste(task, score, sep = ":", collapse = ",")
  )
# A tibble: 4 x 7
# Groups: id [2]
id task a b score overall tally
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 123 54 23 78 50.5 52.8 54:50.5,12:55
2 234 23 67 45 56 61.5 23:56,58:67
3 123 12 45 65 55 52.8 54:50.5,12:55
4 234 58 89 45 67 61.5 23:56,58:67
And to get your desired final output, just select and slice.
# Same computation as above, reduced to one summary row per id.
df %>%
  mutate(score = rowMeans(.[, 3:4], na.rm = TRUE)) %>%
  group_by(id) %>%
  mutate(
    overall = mean(score),
    tally = paste(task, score, sep = ":", collapse = ",")
  ) %>%
  # keep only the per-id columns, then the first row of each id
  select(id, overall, tally) %>%
  slice(1)
# A tibble: 2 x 3
id overall tally
<dbl> <dbl> <chr>
1 123 52.8 54:50.5,12:55
2 234 61.5 23:56,58:67
I'd like to be able to find the most frequently occurring level in a factor in a dataset while using dplyr's piping structure. I'm trying to create a new variable that contains the 'modal' factor level when being grouped by another variable.
This is an example of what I'm looking for:
# Example data: 100 random single-letter categories (A-Z) and 100 random whole
# numbers in [0, 500). No seed is set, so the values differ between runs.
df <- data.frame(cat = stringi::stri_rand_strings(100, 1, '[A-Z]'), num = floor(runif(100, min=0, max=500)))
# Desired usage: attach each group's most frequent num to every row of the group.
# `Mode` is the function being asked for; it is not defined here.
df <- df %>%
dplyr::group_by(cat) %>%
dplyr::mutate(cat_mode = Mode(num))
Where "Mode" is a function that I'm looking for
Use table to count the items and then use which.max to find out the most frequent one:
# Per cat: table() counts each num, which.max() picks the first entry with the
# largest count, and names() returns that value as a character string.
df %>%
group_by(cat) %>%
mutate(cat_mode = names(which.max(table(num)))) %>%
head()
# A tibble: 6 x 3
# Groups: cat [4]
# cat num cat_mode
# <fctr> <dbl> <chr>
#1 Q 305 138
#2 W 34.0 212
#3 R 53.0 53
#4 D 395 5
#5 W 212 212
#6 Q 417 138
# ...
This is similar to the question "Is there a built-in function for finding the mode?":
# Most frequent value of `x`, returned with the same type as `x`.
# When several values share the highest count, the one that appears first in
# `x` wins.
Mode <- function(x) {
  distinct_values <- unique(x)
  # count how often each distinct value occurs, in order of first appearance
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Add each group's most frequent num (as computed by Mode) to every row of the group.
df %>%
group_by(cat) %>%
mutate(cat_mode = Mode(num))
# A tibble: 100 x 3
# Groups: cat [26]
cat num cat_mode
<fct> <dbl> <dbl>
1 S 25 25
2 V 86 478
3 R 335 335
4 S 288 25
5 S 330 25
6 Q 384 384
7 C 313 313
8 H 275 275
9 K 274 274
10 J 75 75
# ... with 90 more rows
To see for each factor
# One row per cat, holding that group's most frequent num.
df %>%
group_by(cat) %>%
summarise(cat_mode = Mode(num))
# A tibble: 26 x 2
cat cat_mode
<fct> <dbl>
1 A 480
2 B 380
3 C 313
4 D 253
5 E 202
6 F 52
7 G 182
8 H 275
9 I 356
10 J 75
# ... with 16 more rows
Question:
I am using dplyr to do data analysis in R, and I come across the following problem.
My data frame is like this:
item day val
1 A 1 90
2 A 2 100
3 A 3 110
4 A 5 80
5 A 8 70
6 B 1 75
7 B 3 65
The data frame is already arranged in item, day. Now I want to mutate a new column, with each row being the smallest value of the same group AND having the day to be within the next 2 days.
For the example above, I want the resulting data frame to be:
item day val output
1 A 1 90 100 # the smaller of 100 and 110
2 A 2 100 110 # the only value within 2 days
3 A 3 110 80 # the only value within 2 days
4 A 5 80 NA # there is no data within 2 days
5 A 8 70 NA # there is no data within 2 days
6 B 1 75 65 # the only value within 2 days
7 B 3 65 NA # there is no data within 2 days
I understand that I will probably use group_by and mutate, but how to write the inside function in order to achieve my desired result?
Any help is greatly appreciated. Let me know if you need me to clarify anything. Thank you!
Try this:
# For every row, find the smallest `val` among later rows of the same item
# whose day is 1 or 2 days ahead; NA when there is no such row.
# Only the next two rows need checking, assuming the data is sorted by
# (item, day) with at most one row per item and day.
df %>%
  # arrange(item, day) %>%  # if not already arranged
  # take note of the next two values & corresponding difference in days
  group_by(item) %>%
  mutate(
    val.1 = lead(val),
    day.1 = lead(day) - day,
    val.2 = lead(val, 2),
    day.2 = lead(day, 2) - day
  ) %>%
  ungroup() %>%
  mutate(
    # a candidate that is not exactly 1 or 2 days ahead is discarded (NA);
    # %in% also treats the NA differences at the end of a group as "no"
    val.1 = ifelse(day.1 %in% c(1, 2), val.1, NA),
    val.2 = ifelse(day.2 %in% c(1, 2), val.2, NA),
    # pmin() is the row-wise minimum: with na.rm = TRUE it ignores a single
    # NA and returns NA when both candidates are NA, so no per-row grouping,
    # no Inf placeholder and no "returning Inf" warnings are needed.
    # It keeps val's type, so output prints as <int> here rather than <dbl>.
    output = pmin(val.1, val.2, na.rm = TRUE)
  ) %>%
  # arrange results
  select(item, day, val, output) %>%
  arrange(item, day)
# A tibble: 7 x 4
item day val output
<fctr> <int> <int> <dbl>
1 A 1 90 100
2 A 2 100 110
3 A 3 110 80.0
4 A 5 80 NA
5 A 8 70 NA
6 B 1 75 65.0
7 B 3 65 NA
Data:
# Rebuild the example data from the question. The header has one field fewer
# than the data rows, so the leading row numbers become row names.
df <- read.table(text = " item day val
1 A 1 90
2 A 2 100
3 A 3 110
4 A 5 80
5 A 8 70
6 B 1 75
7 B 3 65", header = TRUE)
We can use complete from the tidyr package to complete the dataset by day, and then use lead from dplyr and rollapply from zoo to find the minimum of the next two days.
library(dplyr)
library(tidyr)
library(zoo)
# For each row, the minimum val over the following two calendar days of the
# same item, or NA when neither day has data.
DF2 <- DF %>%
group_by(item) %>%
# insert a row for every missing day inside each item's day range (val is NA there)
complete(day = full_seq(day, period = 1)) %>%
# window of width 2 over the values shifted by one day, i.e. the next two days;
# NAs inside a window are ignored, incomplete windows are filled with NA
mutate(output = rollapply(lead(val), width = 2, FUN = min, na.rm = TRUE,
fill = NA, align = "left")) %>%
# drop the rows with missing val, i.e. the ones added by complete()
drop_na(val) %>%
ungroup() %>%
# min() over an all-NA window returns Inf; report those as NA
mutate(output = ifelse(output == Inf, NA, output))
# print the result
DF2
# # A tibble: 7 x 4
# item day val output
# <chr> <dbl> <int> <dbl>
# 1 A 1.00 90 100
# 2 A 2.00 100 110
# 3 A 3.00 110 80.0
# 4 A 5.00 80 NA
# 5 A 8.00 70 NA
# 6 B 1.00 75 65.0
# 7 B 3.00 65 NA
DATA
# Rebuild the example data from the question. The header has one field fewer
# than the data rows, so the leading row numbers become row names;
# stringsAsFactors = FALSE keeps item as a character column.
DF <- read.table(text = "item day val
1 A 1 90
2 A 2 100
3 A 3 110
4 A 5 80
5 A 8 70
6 B 1 75
7 B 3 65",
header = TRUE, stringsAsFactors = FALSE)
We'll create a dataset with modified day, so we can left join it on the original dataset, keeping only minimum value.
# Shift every row back by one and by two days so that, after joining on
# (item, day), each original row is paired with the values observed on the
# following two days; rows without such a value get output = NA.
df %>%
  left_join(
    bind_rows(mutate(., day = day - 1), mutate(., day = day - 2)) %>%
      rename(output = val),
    # explicit keys: no reliance on (or message about) the natural join
    by = c("item", "day")
  ) %>%
  group_by(item, day, val) %>%
  # smallest candidate per original row; min(NA) stays NA for unmatched rows.
  # .groups = "drop" returns an ungrouped result.
  summarise(output = min(output), .groups = "drop")
# # A tibble: 7 x 4
# item day val output
# <fctr> <dbl> <int> <dbl>
# 1 A 1 90 100
# 2 A 2 100 110
# 3 A 3 110 80
# 4 A 5 80 NA
# 5 A 8 70 NA
# 6 B 1 75 65
# 7 B 3 65 NA
data
# Rebuild the example data from the question. The header has one field fewer
# than the data rows, so the leading row numbers become row names.
df <- read.table(text = " item day val
1 A 1 90
2 A 2 100
3 A 3 110
4 A 5 80
5 A 8 70
6 B 1 75
7 B 3 65", header = TRUE)