Create "metadata" field in R - r

I have a data frame set up similar to this:
id <- c(123,234,123,234)
task <- c(54,23,12,58)
a <- c(23,67,45,89)
b <- c(78,45,65,45)
df <- data.frame(id,task,a,b)
> df
id task a b
1 123 54 23 78
2 234 23 67 45
3 123 12 45 65
4 234 58 89 45
where I score a and b for each ID:
df$score <- rowMeans(subset(df, select = c(3:4)), na.rm = TRUE)
> df
id task a b score
1 123 54 23 78 50.5
2 234 23 67 45 56.0
3 123 12 45 65 55.0
4 234 58 89 45 67.0
for each id I got an aggregate score like such:
out <- ddply(df, 1, summarise,
overall = mean(score, na.rm = TRUE))
> out
id overall
1 123 52.75
2 234 61.50
but what I want my final output to have is a new column that has the scores that went into the overall and their task id like this:
id overall meta
1 123 52.75 "task_scores":[{"54":50.5,"12":55}]
2 234 61.50 "task_scores":[{"23":56,"58":67}]
how would I go about doing that using R?

We could make use of jsonlite to create the structure
library(jsonlite)
library(plyr)
ddply(df, "id", summarise, overall = mean(score, na.rm = TRUE),
meta = paste0('"task_scores":',
toJSON(setNames(as.data.frame.list(score), task))))
# id overall meta
#1 123 52.75 "task_scores":[{"54":50.5,"12":55}]
#2 234 61.50 "task_scores":[{"23":56,"58":67}]

I don't know how to make that metadata dictionary offhand, but you could do something like this:
library(dplyr)
library(magrittr)
out <- df %>% group_by(id) %>% mutate(overall = mean(score))
> out
# A tibble: 4 x 6
# Groups: id [2]
id task a b score overall
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 123 54 23 78 50.5 52.8
2 234 23 67 45 56 61.5
3 123 12 45 65 55 52.8
4 234 58 89 45 67 61.5
So the df would have both the aggregated scores and preserve the data in the original rows.

You can do it with a few mutates. Get your row average, then your group average, then paste your tallies.
library(dplyr)
df %>%
mutate(score = rowMeans(subset(., select = c(3:4)), na.rm = TRUE)) %>%
group_by(id) %>%
mutate(overall = mean(score)) %>%
mutate(tally = paste(task, score, sep = ":", collapse = ","))
# A tibble: 4 x 7
# Groups: id [2]
id task a b score overall tally
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 123 54 23 78 50.5 52.8 54:50.5,12:55
2 234 23 67 45 56 61.5 23:56,58:67
3 123 12 45 65 55 52.8 54:50.5,12:55
4 234 58 89 45 67 61.5 23:56,58:67
And to get your desired final output, just select and slice.
df %>%
mutate(score = rowMeans(subset(., select = c(3:4)), na.rm = TRUE)) %>%
group_by(id) %>%
mutate(overall = mean(score)) %>%
mutate(tally = paste(task, score, sep = ":", collapse = ",")) %>%
select(id, overall, tally) %>%
slice(1)
# A tibble: 2 x 3
id overall tally
<dbl> <dbl> <chr>
1 123 52.8 54:50.5,12:55
2 234 61.5 23:56,58:67

Related

Expand a data frame by group

I have a big data frame which consists of 1000 data frames (500x500), and I created it with the following code:
setwd("user/all_csv")
archivos <- list.files(full.names = F)
big.df <- lapply(archivos, read.csv, header = TRUE) %>%
set_names(archivos)%>%
bind_rows(.id = 'grp')
The big.df looks like below (a small example):
grp X X1 X2 X5
2020_01_19 1 23 47 3
2020_01_19 2 13 45 54
2020_01_19 5 23 41 21
2020_01_20 1 65 32 19
2020_01_20 2 39 52 12
2020_01_20 5 43 76 90
...
How can I generate the output below?:
1-X1 1-X2 1-X5 2-X1 2-X2 2-X5 5-X1 5-X2 5-X5
2020_01_19 23 47 3 13 45 54 23 41 21
2020_01_20 65 32 19 39 52 12 43 76 90
...
I don't really know how to proceed. Any help would be greatly appreciated.
use tidyr::pivot_wider with names_glue argument as follows.
Store name of all variables (even 500) to be pivoted into a vector say cols
Use values_from = all_of(cols) as argument in pivot_wider
cols <- c('X1', 'X2', 'X5')
df %>% pivot_wider(id_cols = grp, names_from = X, values_from = all_of(cols),
names_glue = '{X}-{.value}')
# A tibble: 2 x 10
grp `1-X1` `2-X1` `5-X1` `1-X2` `2-X2` `5-X2` `1-X5` `2-X5` `5-X5`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 2020_01_19 23 13 23 47 45 41 3 54 21
2 2020_01_20 65 39 43 32 52 76 19 12 90
If you want to use all columns except first two, use this
df %>% pivot_wider(id_cols = grp, names_from = X, values_from = !c(grp, X),
names_glue = '{X}-{.value}')
# A tibble: 2 x 10
grp `1-X1` `2-X1` `5-X1` `1-X2` `2-X2` `5-X2` `1-X5` `2-X5` `5-X5`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 2020_01_19 23 13 23 47 45 41 3 54 21
2 2020_01_20 65 39 43 32 52 76 19 12 90
However, if you want to rearrange columns as shown in expected outcome, you may use names_vary = 'slowest' in pivot_wider function of tidyr 1.2.0.

Using Dcast in R to transform dataframe

I have the following dataframe. And would like to get the desired output
data.frame(df)
num Name1 Result1 Name2 Result2 Name3 Result3
1 75%  74 100%  101 50%  50
2 75%  73 100%  101 50%  49
3 50% 50 100%  105 125% 128
I tried Dcast using the following
reshape2::dcast(df, num ~ Name1 + Name2 + Name3, value.var=c("Result1", "Result2", "Result3"))
The output from Dcast is close to my desired output but I would like only unique 'Name' values as my new columns. I can imagine that I can clean the table using aggregate before using Dcast but that seems excessive? I'm not sure if there's a faster way?
Desired output:
num 50% 75% 100% 125%
1 50 74 101 NA
2 49 73 101 NA
3 50 NA 105 128
I would appreciate any help
You can find more information on the steps here and here.
dat %>%
rename_at(vars(matches("[0-9]")),
~str_replace(.,"(\\d)(\\w*)","\\2_\\1")) %>%
pivot_longer(cols=matches("_"),names_to=c(".value","group"),
names_sep="_") %>%
dplyr::select(-group) %>%
pivot_wider(names_from = "Name",values_from="Result")
# A tibble: 3 x 5
num `75%` `100%` `50%` `125%`
<int> <int> <int> <int> <int>
1 1 74 101 50 NA
2 2 73 101 49 NA
3 3 NA 105 50 128
Alternately...
reshape(dat, idvar="num", direction="long",
varying=list(Name=c(2,4,6), Result=c(3,5,7)),
v.names = c("Name", "Result") ) %>%
dplyr::select(-time) %>%
dcast(num ~ Name)
num 50% 75% 100% 125%
1 1 50 74 101 NA
2 2 49 73 101 NA
3 3 50 NA 105 128
Get the data in long format so we have data in two columns Name and Result. We can then get the data in wide format.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -num,
names_to = '.value',
names_pattern = '([A-Za-z]+)\\d+') %>%
arrange(readr::parse_number(Name)) %>%
pivot_wider(names_from = Name, values_from = Result)
# num `50%` `75%` `100%` `125%`
# <int> <int> <int> <int> <int>
#1 1 50 74 101 NA
#2 2 49 73 101 NA
#3 3 50 NA 105 128

R Tibble: Arrange by two columns

Let's say I have the following data:
set.seed(123)
test <- tibble(
ID = sample(rep(1:100, rpois(100,4))),
vals = abs(round(rnorm(length(ID), 10000, 5000)))
)
I would like to sort test first by vals and then by ID with the desired output looking like this:
# A tibble: 409 x 2
ID vals
<int> <dbl>
1 48 26522
2 48 14427
3 48 7570
4 48 5922
5 92 25286
6 92 10436
7 92 5705
8 92 4036
9 92 3399
10 64 22190
# ... with 399 more rows
i.e. it should group the sorting by ID and then in decreasing order vals.
What I tried:
test %>% arrange(ID, desc(vals))
test %>% arrange(desc(vals), ID)
test %>% arrange(ID) %>% arrange(desc(vals))
I think you're missing some clarity in the sort order:
sort first by each ID's maximum value, descending; then, within each ID, sort by vals, descending.
Try this:
library(dplyr)
test %>%
mutate(valrank = dense_rank(-vals)) %>%
group_by(ID) %>%
mutate(valrank = min(valrank)) %>%
ungroup() %>%
arrange(valrank, ID, desc(vals))
# # A tibble: 409 x 3
# ID vals valrank
# <int> <dbl> <int>
# 1 48 26522 1
# 2 48 14427 1
# 3 48 7570 1
# 4 48 5922 1
# 5 92 25286 2
# 6 92 10436 2
# 7 92 5705 2
# 8 92 4036 2
# 9 92 3399 2
# 10 64 22190 3
# # ... with 399 more rows
(I kept valrank just for demonstration.)

how to get name of variables instead of x when looping using map() functions in R?

I am applying a user-defined function to the numeric variables of a dataset, but instead of getting their names I am getting x when it is applied using the map function. How do I replace x with the variable name in map functions?
dataset: hd_trn
age sex cp trestbps chol fbs restecg thalach exang
<int> <fctr> <fctr> <int> <int> <fctr> <fctr> <int> <fctr>
63 1 1 145 233 1 2 150 0
67 1 4 160 286 0 2 108 1
67 1 4 120 229 0 2 129 1
37 1 3 130 250 0 0 187 0
41 0 2 130 204 0 2 172 0
56 1 2 120 236 0 0 178 0
user defined function to calculate high freq elements column wise
# Tabulate the values of `x` and return the most frequent ones as a data
# frame: the value column (named `x`) followed by `Freq`, ordered from the
# highest count to the lowest. Row selection is delegated to `top_n(5, Freq)`.
top_freq_elements <- function(x) {
  counts <- as.data.frame(table(x))
  most_common <- top_n(counts, 5, Freq)
  arrange(most_common, desc(Freq))
}
Applying function
hd_trn %>% select_if(is.numeric) %>% map(., .f = top_freq_elements)
######### output #########
x Freq
<fctr> <int>
54 51
58 43
55 41
56 38
57 38
desired: In the above output I am looking to get variable name instead of x
Tried reconstructing code below using imap but that is also not giving variable name:
hd_trn %>%
select_if(is.numeric) %>%
imap(function(feature_value, feature_name){
table(feature_value) %>%
as.data.frame() %>% #head()
rename(feature_name = feature_value) %>%
top_n(5, Freq) %>%
arrange(desc(Freq))
})
######### output #########
feature_name Freq
<fctr> <int>
54 51
58 43
55 41
56 38
57 38
You can rename the 1st column in each list :
library(dplyr)
library(purrr)
iris %>%
select(where(is.numeric)) %>%
imap(function(feature_value, feature_name){
table(feature_value) %>%
as.data.frame() %>%
rename_with(~feature_name, 1) %>%
slice_max(n = 5, Freq) %>%
arrange(desc(Freq))
})
This could be achieved using e.g. curly-curly {{ and := in rename like so:
# Count how often each distinct value of `x` occurs and return the five most
# frequent values as a data frame: the value column (named `x`) followed by
# `Freq`, sorted by descending frequency.
#
# `slice_max()` replaces the superseded `top_n()`; like `top_n()`, it keeps
# rows tied on the last retained count, so more than five rows can come back.
top_freq_elements <- function(x) {
  table(x) %>%
    as.data.frame() %>%
    slice_max(Freq, n = 5) %>%
    arrange(desc(Freq))
}
library(dplyr)
library(purrr)
hd_trn %>%
select_if(is.numeric) %>%
imap(function(feature_value, feature_name){
table(feature_value) %>%
as.data.frame() %>% #head()
rename({{feature_name}} := feature_value) %>%
top_n(5, Freq) %>%
arrange(desc(Freq))
})
#> $age
#> age Freq
#> 1 67 2
#> 2 37 1
#> 3 41 1
#> 4 56 1
#> 5 63 1
#>
#> $sex
#> sex Freq
#> 1 1 5
#> 2 0 1
#>
#> $cp
#> cp Freq
#> 1 2 2
#> 2 4 2
#> 3 1 1
#> 4 3 1
#>
#> $trestbps
#> trestbps Freq
#> 1 120 2
#> 2 130 2
#> 3 145 1
#> 4 160 1

Get most frequently occurring factor level in dplyr piping structure

I'd like to be able to find the most frequently occurring level in a factor in a dataset while using dplyr's piping structure. I'm trying to create a new variable that contains the 'modal' factor level when being grouped by another variable.
This is an example of what I'm looking for:
df <- data.frame(cat = stringi::stri_rand_strings(100, 1, '[A-Z]'), num = floor(runif(100, min=0, max=500)))
df <- df %>%
dplyr::group_by(cat) %>%
dplyr::mutate(cat_mode = Mode(num))
Where "Mode" is a function that I'm looking for
Use table to count the items and then use which.max to find out the most frequent one:
df %>%
group_by(cat) %>%
mutate(cat_mode = names(which.max(table(num)))) %>%
head()
# A tibble: 6 x 3
# Groups: cat [4]
# cat num cat_mode
# <fctr> <dbl> <chr>
#1 Q 305 138
#2 W 34.0 212
#3 R 53.0 53
#4 D 395 5
#5 W 212 212
#6 Q 417 138
# ...
This is similar to the question "Is there a built-in function for finding the mode?":
# Return the most frequent value (statistical mode) of a vector.
#
# Arguments:
#   x      An atomic vector or factor.
#   na.rm  If TRUE, drop missing values before counting. Defaults to FALSE,
#          in which case NA is counted like any other value and can itself
#          be returned as the mode.
#
# Returns a length-one vector of the same type as `x`. When several values
# are tied, the one that appears first in `x` wins. Empty input (or input
# that is entirely NA with na.rm = TRUE) yields NA.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  # match() maps each element to its position in `ux`; tabulate() counts
  # those positions, and which.max() picks the first one with the top count.
  ux[which.max(tabulate(match(x, ux)))]
}
df %>%
group_by(cat) %>%
mutate(cat_mode = Mode(num))
# A tibble: 100 x 3
# Groups: cat [26]
cat num cat_mode
<fct> <dbl> <dbl>
1 S 25 25
2 V 86 478
3 R 335 335
4 S 288 25
5 S 330 25
6 Q 384 384
7 C 313 313
8 H 275 275
9 K 274 274
10 J 75 75
# ... with 90 more rows
To see for each factor
df %>%
group_by(cat) %>%
summarise(cat_mode = Mode(num))
# A tibble: 26 x 2
cat cat_mode
<fct> <dbl>
1 A 480
2 B 380
3 C 313
4 D 253
5 E 202
6 F 52
7 G 182
8 H 275
9 I 356
10 J 75
# ... with 16 more rows

Resources