How to wide data from dataframe - r

I have data table whose look like this
I want this output

I would use:
library(dplyr)
data %>%
group_by(SubjectID) %>%
summarise_at(vars(everything()), ~paste0(unique(.), collapse = ",")) %>%
mutate_if(is.character, ~paste0("[", ., "]"))
Output is:
# A tibble: 1 × 4
SubjectID PunchLocation NumOfPunch PunchType
<dbl> <chr> <chr> <chr>
1 102 [1,2,3,4,5] [1,2] [5,6,9]
I used this data:
data <- tibble(
SubjectID = rep(102, 12),
PunchLocation = c(1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 4, 4),
NumOfPunch = c(rep(1, 10), 2, 2),
PunchType = c(5, 6, 5, 6, 9, 5, 6, 5, 6, 9, 5, 6)
)

Using data.table (with example from #Stephan).
library(data.table)
setDT(data)
fun.collapse <- function(x) sprintf('[%s]', paste(unique(x), collapse=','))
dcast(data, SubjectID ~ .,
value.var = c('PunchLocation', 'NumOfPunch', 'PunchType'),
fun.aggregate = fun.collapse)
## SubjectID PunchLocation NumOfPunch PunchType
## 1: 102 [1,2,3,4,5] [1,2] [5,6,9]

Related

Using map() function to apply for each element

I need, with the help of the map() function, apply the above for each element
How can I do so?
As dt is of class data.table, you can make a vector of columns of interest (i.e. your items; below I use grepl on the names), and then apply your weighting function to each of those columns using .SD and .SDcols, with by
qs = names(dt)[grepl("^q", names(dt))]
dt[, (paste0(qs,"wt")):=lapply(.SD, \(q) 1/(sum(!is.na(q))/.N)),
.(sex, education_code, age), .SDcols = qs]
As mentioned in the comments, you miss a dt <- in your dt[, .(ID, education_code, age, sex, item = q1_1)] which makes the column item unavailable in the following line dt[, no_respond := is.na(item)].
Your weighting scheme is not absolutely clear to me however, assuming you want to do what is done in your code here, I would go with dplyr solution to iterate over columns.
# your data without no_respond column and correcting missing value in q2_3
dt <- data.table::data.table(
ID = c(1,2,3,4, 5, 6, 7, 8, 9, 10),
education_code = c(20,50,20,60, 20, 10,5, 12, 12, 12),
age = c(87,67,56,52, 34, 56, 67, 78, 23, 34),
sex = c("F","M","M","M", "F","M","M","M", "M","M"),
q1_1 = c(NA,1,5,3, 1, NA, 3, 4, 5,1),
q1_2 = c(NA,1,5,3, 1, 2, NA, 4, 5,1),
q1_3 = c(NA,1,5,3, 1, 2, 3, 4, 5,1),
q1_text = c(NA,1,5,3, 1, 2, 3, 4, 5,1),
q2_1 = c(NA,1,5,3, 1, 2, 3, 4, 5,1),
q2_2 = c(NA,1,5,3, 1, 2, 3, 4, 5,1),
q2_3 = c(NA,1,5,3, 1, NA, NA, 4, 5,1),
q2_text = c(NA,1,5,3, 1, NA, 3, 4, 5,1))
dt %>%
group_by(sex, education_code, age) %>% #groups the df by sex, education_code, age
add_count() %>% #add a column with number of rows in each group
mutate(across(starts_with("q"), #for each column starting with "q"
~ 1/(sum(!is.na(.))/n), #create a new column following your weight calculation
.names = '{.col}_wgt')) %>% #naming the new column with suffix "_wgt" to original name
ungroup()

Filter for Row from Multiple Data Frame in R

I have five data frames with the same dimension (8 by 2) but with different column names as follows:
nbb <- data.frame(
nbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
nbb_RMSE = c(1.0152338, 0.7199394, 0.7990978, 0.9045563, 1.6514406, 0.5160516, 0.4964024, 0.2617795)
)
mbb <- data.frame(
mbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
mbb_RMSE = c(0.8324074, 0.9278236, 1.9817984, 0.9567368, 0.2814623, 0.1129459, 0.1233126, 0.4222578)
)
cbb <- data.frame(
cbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
cbb_RMSE = c(1.27782499, 1.96332220, 0.74704997, 0.46579943, 1.10850563, 0.40456698, 0.26027359, 0.02452239)
)
tmbb <- data.frame(
tmbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tmbb_RMSE = c(0.83240742, 1.05126826, 0.08290467, 0.76397988, 1.23772208, 0.57628337, 0.56437185, 0.46460279)
)
tcbb <- data.frame(
tcbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tcbb_RMSE = c(0.9328510, 0.8312332, 0.9402116, 1.6029357, 2.0001519, 0.4387557, 0.5965582, 0.4148854)
)
I want to create a new data frame that will contain each row that contains minimum RMSE in the five(5) data frames above. If I make the row names to be the name of different data frames as (nbb, mbb, cbb, tmbb, tcbb)
rownames(df) <- c("nbb", "mbb", "cbb", "tmbb", "tcbb")
I desire to have a result like the below:
df
lb
RMSE
nbb
9
0.2617795
mbb
7
0.1129459
cbb
9
0.02452239
tmbb
4
0.08290467
tcbb
8
0.4387557
Here is a base R way.
First put the data.frames in a list with mget, then lapply the list an anonymous function outputting the minimum RMSE row. Bind the output rows and reorder according to the wanted row names.
nbb <- data.frame(
nbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
nbb_RMSE = c(1.0152338, 0.7199394, 0.7990978, 0.9045563, 1.6514406, 0.5160516, 0.4964024, 0.2617795)
)
mbb <- data.frame(
mbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
mbb_RMSE = c(0.8324074, 0.9278236, 1.9817984, 0.9567368, 0.2814623, 0.1129459, 0.1233126, 0.4222578)
)
cbb <- data.frame(
cbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
cbb_RMSE = c(1.27782499, 1.96332220, 0.74704997, 0.46579943, 1.10850563, 0.40456698, 0.26027359, 0.02452239)
)
tmbb <- data.frame(
tmbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tmbb_RMSE = c(0.83240742, 1.05126826, 0.08290467, 0.76397988, 1.23772208, 0.57628337, 0.56437185, 0.46460279)
)
tcbb <- data.frame(
tcbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tcbb_RMSE = c(0.9328510, 0.8312332, 0.9402116, 1.6029357, 2.0001519, 0.4387557, 0.5965582, 0.4148854)
)
df_list <- mget(ls(pattern = "bb$"))
tmp <- lapply(df_list, \(x){
i <- which.min(x[[2]])
if(length(i) > 0L) {
data.frame(lb = x[i, 1], RMSE = x[i, 2])
} else NULL
})
res <- do.call(rbind, tmp)
rm(tmp)
res <- cbind.data.frame(df = names(df_list), res)
i <- order(c("nbb", "mbb", "cbb", "tmbb", "tcbb"))
res <- res[i,]
res
#> df lb RMSE
#> nbb nbb 9 0.26177950
#> mbb mbb 7 0.11294590
#> cbb cbb 9 0.02452239
#> tmbb tmbb 4 0.08290467
#> tcbb tcbb 9 0.41488540
Created on 2022-04-10 by the reprex package (v2.0.1)
First combine your 5 data frames into a list, then use lapply to go through all your data frames and output the necessary information. Also, wrap the lapply() with a do.call to row bind (rbind) the results. Finally, change the result into a data frame and covert the rownames to column names
You can skip rownames_to_column("df") if you want to have row names instead of a column storing the dataframe names.
library(tibble)
df_list <- list(nbb = nbb, mbb = mbb, cbb = cbb, tmbb = tmbb, tcbb = tcbb)
do.call(rbind, lapply(df_list, function(x) data.frame(lb = x[which.min(x[,2]), 1], RMSE = min(x[, 2])))) %>%
rownames_to_column("df")
df lb RMSE
1 nbb 9 0.26177950
2 mbb 7 0.11294590
3 cbb 9 0.02452239
4 tmbb 4 0.08290467
5 tcbb 9 0.41488540
Input data
Moreover, note that the way you define dataframe in your post is not correct, I've included the "correct" way here for reference.
nbb <- data.frame(
nbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
nbb_RMSE = c(1.0152338, 0.7199394, 0.7990978, 0.9045563, 1.6514406, 0.5160516, 0.4964024, 0.2617795))
mbb <- data.frame( mbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
mbb_RMSE = c(0.8324074, 0.9278236, 1.9817984, 0.9567368, 0.2814623, 0.1129459, 0.1233126, 0.4222578)
)
cbb <- data.frame(
cbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
cbb_RMSE = c(1.27782499, 1.96332220, 0.74704997, 0.46579943, 1.10850563, 0.40456698, 0.26027359, 0.02452239)
)
tmbb <- data.frame(
tmbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tmbb_RMSE = c(0.83240742, 1.05126826, 0.08290467, 0.76397988, 1.23772208, 0.57628337, 0.56437185, 0.46460279))
tcbb <- data.frame(
tcbb_lb = c(2, 3, 4, 5, 6, 7, 8, 9),
tcbb_RMSE = c(0.9328510, 0.8312332, 0.9402116, 1.6029357, 2.0001519, 0.4387557, 0.5965582, 0.4148854)
)
If you are open to a purrr approach, you could use
library(purrr)
library(dplyr)
my_list %>%
map_dfr(~.x %>%
filter(if_any(ends_with("_RMSE"), ~.x == min(.x))) %>%
rename_with(~gsub(".*_", "", .x)),
.id = "df") %>%
bind_rows()
this returns
df lb RMSE
1 cbb 9 0.02452239
2 mbb 7 0.11294590
3 nbb 9 0.26177950
4 tcbb 9 0.41488540
5 tmbb 4 0.08290467
with (borrowed from Rui Barradas)
my_list <- mget(ls(pattern = "bb$"))

How to define score of popularity for list of tags in R?

There is a dataset where each object has a list of tags of categories comma separated. I would like to have aggregated categories score per object based on categories' popularities. I can define the sum, min, and max of popularities but it's not clear to me how an aggregated score can be calculated.
library(tidyverse)
library(tibble)
library(stringr)
# 1. Data
df <- tribble(
~object, ~category,
1, "Software, Model, Cloud",
2, "Model",
3, "Cloud, Software",
4, "Train, Test, Model",
5, "Test, Model"
)
# 2. List of categories
list_category <- trimws(unlist(str_split(df$category, ",")))
# 3. Categories popularity
data.frame(category = list_category) %>%
group_by(category) %>%
summarise(n_count = n()) %>%
arrange(-n_count) %>%
ungroup()
# 4. Outcome with undefined 'score_category' feature that I'd like to know how to score
tribble(
~object, ~sum_category, ~min_category, ~max_category, ~score_category,
1, sum(c(2, 4, 2)), min(c(2, 4, 2)), max(c(2, 4, 2)), NA,
2, sum(c(4)), min(c(4)), max(c(4)), NA,
3, sum(c(2, 2)), min(c(2, 2)), max(c(2, 2)), NA,
4, sum(c(1, 2, 4)), min(c(1, 2, 4)), max(c(1, 2, 4)), NA,
5, sum(c(2, 4)), min(c(2, 4)), max(c(2, 4)), NA
)
Any ideas and code are welcome!

How to mutate columns in R based on ordering of subset of these columns?

To begin with, let's suppose we have a dataset like this:
data <- data.frame(
id = 1:5,
time = c(0.1, 0.2, 0.1, 0.1, 0.2),
obj_a_size = c(1, 3, 8, 4, 2),
obj_a_cuteness = c(3, 6, 4, 1, 2),
obj_b_size = c(5, 4, 4, 2, 5),
obj_b_cuteness = c(6, 2, 10, 9, 6),
obj_c_size = c(3, 6, 7, 1, 6),
obj_c_cuteness = c(10, 1, 6, 8, 8)
)
It has columns concerning whole experiment (like time) and object-specific columns (like X_size and X_cuteness). These objects are ordered randomly, though, so I'd like to mutate these column to order the objects by size for each experiment separately. The result I expect to be like that:
data <- data.frame(
id = 1:5,
time = c(0.1, 0.2, 0.1, 0.1, 0.2),
obj_max_size = c(5, 6, 8, 4, 6),
obj_max_cuteness = c(6, 1, 4, 1, 8),
obj_2nd_size = c(3, 4, 7, 2, 5),
obj_2nd_cuteness = c(10, 2, 6, 9, 6),
obj_min_size = c(1, 3, 3, 1, 2),
obj_min_cuteness = c(3, 6, 10, 8, 2)
)
Notice that cuteness isn't ordered descending or ascending, but I want cuteness to be considered part of an object and set obj_max_cuteness = obj_2_cuteness wherever obj_max_size = obj_2_size, and so on.
Number of objects is known in advance (there are four of them), columns are known as well, and there are four columns describing each object. There is no missing data. I'm willing to use any package, if necessary. Also, original dataset is about 500k by 30, so bonus points for quick or memory-friendly code.
EDIT: Some noticed that the description is not very clear. What I'm after is a bit object-oriented thing: in the case above each object within experiment could be described as such (X in obj_X_ means that it belongs to experiment no. X):
obj_1_a = {"size": 1, "cuteness": 3}
obj_1_b = {"size": 5, "cuteness": 6}
obj_1_c = {"size": 3, "cuteness": 10}
obj_2_a = {"size": 3, "cuteness": 6}
...
I want to reorder them by size so that (in the resulting data frame):
obj_1_max = {"size": 5, "cuteness": 6}
obj_1_2nd = {"size": 3, "cuteness": 10}
obj_1_min = {"size": 1, "cuteness": 3}
obj_2_max = {"size": 6, "cuteness": 1}
...
Is this what you are after?
The min and max value calculations are straightforward. To find the 2nd max you need to do a bit more work. My interpretation of the 2nd values is that it is the 2nd value of the sorted and unique values. My output differs from yours but that may be due to a different interpretation of what you mean by the 2nd value. My reading: you are looking for the first value down from the max value; from the groups of 3 columns (size, cuteness).
library(dplyr)
data <- data.frame(
id = 1:5,
time = c(0.1, 0.2, 0.1, 0.1, 0.2),
obj_a_size = c(1, 3, 8, 4, 2),
obj_a_cuteness = c(3, 6, 4, 1, 2),
obj_b_size = c(5, 4, 4, 2, 5),
obj_b_cuteness = c(6, 2, 10, 9, 6),
obj_c_size = c(3, 6, 7, 1, 6),
obj_c_cuteness = c(10, 1, 6, 8, 8)
)
obj_max_size <- data %>%
pivot_longer(cols = contains('size')) %>%
group_by(id) %>%
summarise(obj_max_size = max(value)) %>%
ungroup() %>%
select(obj_max_size)
obj_min_size <- data %>%
pivot_longer(cols = contains('size')) %>%
group_by(id) %>%
summarise(obj_min_size = min(value)) %>%
ungroup() %>%
select(obj_min_size)
obj_2nd_size <- data %>%
pivot_longer(cols = contains('size')) %>%
group_by(id) %>%
distinct(value) %>%
arrange(desc(value)) %>%
slice(2) %>%
ungroup() %>%
select(obj_2nd_size = value)
obj_max_cuteness <- data %>%
pivot_longer(cols = contains('cuteness')) %>%
group_by(id) %>%
summarise(obj_max_cuteness = max(value)) %>%
ungroup() %>%
select(obj_max_cuteness)
obj_min_cuteness <- data %>%
pivot_longer(cols = contains('cuteness')) %>%
group_by(id) %>%
summarise(obj_min_cuteness = min(value)) %>%
ungroup() %>%
select(obj_min_cuteness)
obj_2nd_cuteness <- data %>%
pivot_longer(cols = contains('cuteness')) %>%
group_by(id) %>%
distinct(value) %>%
arrange(desc(value)) %>%
slice(2) %>%
ungroup() %>%
select(obj_2nd_cuteness = value)
output <- bind_cols(id = data$id, obj_max_size, obj_min_size, obj_2nd_size, obj_max_cuteness, obj_min_cuteness, obj_2nd_cuteness)
With output looking like this:
> output
# A tibble: 5 x 7
id obj_max_size obj_min_size obj_2nd_size obj_max_cuteness obj_min_cuteness obj_2nd_cuteness
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 5 1 3 10 3 6
2 2 6 3 4 6 1 2
3 3 8 4 7 10 4 6
4 4 4 1 2 9 1 8
5 5 6 2 5 8 2 6

how to add together dataframes within a list but only for matching dates

I have a list of dataframes that I want to consolidate these dataframes into one data frame. I am looking to solve two problems:
How to add together the columns
How to only include common dates across all the dfs withing the list
This is what I have:
library(tidyverse)
library(lubridate)
df1 <- data.frame(
date = ymd(c("2019-02-01", "2019-02-02", "2019-02-03", "2019-02-04",
"2019-02-05")),
x = c(1, 2, 3, 4, 5),
y = c(2, 3, 4, 5, 6),
z = c(3, 4, 5, 6, 7)
)
df2 <- data.frame(
date = ymd(c("2019-02-01", "2019-02-02", "2019-02-04", "2019-02-05")),
x = c(1, 2, 3, 4),
y = c(2, 3, 4, 5),
z = c(3, 4, 5, 6)
)
df3 <- data.frame(
date = ymd(c("2019-02-01", "2019-02-02", "2019-02-03", "2019-02-04")),
x = c(1, 2, 3, 4),
y = c(2, 3, 4, 5),
z = c(3, 4, 5, 6)
)
dfl <- list(df1, df2, df3)
This is the output I am looking for:
data.frame(
date = ymd(c("2019-02-01", "2019-02-02", "2019-02-04")),
x = c(3, 6, 11),
y = c(6, 9, 14),
z = c(9, 12, 17)
)
I have tried inner_join and tried looping through the list but it got too complicated and I still didn't manage to land on the answer.
Is there a more cleaner way to get to the final answer
How about this?
bind_rows(dfl) %>%
group_by(date) %>%
mutate(n = 1) %>%
summarise_all(sum) %>%
filter(n == length(dfl)) %>%
select(-n)
## A tibble: 3 x 4
# date x y z
# <date> <dbl> <dbl> <dbl>
#1 2019-02-01 3 6 9
#2 2019-02-02 6 9 12
#3 2019-02-04 11 14 17
This assumes that there are no duplicate dates in a single data.frame of dfl.

Resources