Spread a dataframe and relabel column names

Spread a dataframe and relabel column names - r

Probably a tidyr or a reshape2 question.
I have this type of a data.frame:
df <- data.frame(group = c("A","B","C","D"),
id1 = c("AV14D","BV29",NA,NA),
id2 = c(NA,"BD1",NA,NA),
id3 = c("AJ31","BJ1",NA,NA),
n = c(2,4,NA,NA),
stringsAsFactors = F)
which has a row with 3 IDs and a count for each group.
I want to transform it to a single row data.frame with #groups x 4 (id1,id2,id3,n) columns, where each group has 4 columns: <group>_id1, <group>_id2, <group>_id3, and <group>_n and the corresponding values.
So the resulting data.frame will be:
data.frame(A_id1 = "AV14D",A_id2 = NA,A_id3 = "AJ31",A_n = 2,
B_id1 = "BV29",B_id2 = "BD1",B_id3 = "BJ1",B_n = 4,
C_id1 = NA,C_id2 = NA,C_id3 = NA,C_n = NA,
D_id1 = NA,D_id2 = NA,D_id3 = NA,D_n = NA,
stringsAsFactors = F)

We may use pivot_wider after creating a row index column
library(dplyr)
library(tidyr)
library(stringr)
out1 <- df %>%
mutate(rn = 1) %>%
pivot_wider(names_from = group, values_from = id1:n,
names_glue = "{group}_{.value}") %>%
select(order(str_remove(names(.), "_.*")), -rn) %>%
type.convert(as.is = TRUE)
-checking with OP's output
> all.equal(out, out1, check.attributes = FALSE)
[1] TRUE

Related

New column with random boolean values while controlling the ratio of TRUE/FALSE per category

In R I've got a dataset like this one:
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but inside each category the proportion of TRUE and FALSE values should be the same (i.e. the randomizing process should generate the same count of true and false values, in the above data frame 5 TRUEs and 5 FALSEs in each of the 3 categories). How to do this?

You can sample a vector of "TRUE" and "FALSE" values without replacement so you have a randomized and balanced column in your data-frame.
sample(rep(c("TRUE","FALSE"),each=5),10,replace=FALSE)

Based on Yacine Hajji answer:
addRandomBool <- function(df, p){
n <- ceiling(nrow(df) * p)
df$bool <- sample(rep(c("TRUE","FALSE"), times = c(n, nrow(df) - n)))
df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where parametar p determines the proportion of TRUE.

This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){ # Function to saple and assign the new_col
df$new_col <- sample(rep(c(FALSE, TRUE),
each = 5),
size = 10,
replace = FALSE)
df
})) %>%
unnest(cols = "data")
This next example is a little more generalized, but still assumes (approximately) even distribution of TRUE and FALSE within a group. But it can accomodate variable group sizes, and even groups with odd numbers of records (but will favor FALSE for odd numbers of records)
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by - nest, and then use select to set the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data") %>%
select_at(c(orig_col, "new_col")) # Restore the column order
Or you can use a base R solution that doesn't change the column order in the first place
df <- split(df, df["category"])
df <- lapply(df,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.

Vectorization to extract and bind very nested data

I have some very nested data. Within my list-column-dataframes, there are some pieces I need to put together and I've done so in a single instance to get my desired dataframe:
a <- df[[2]][["result"]]#data
b <- df[[2]][["result"]]#coords
desired_df <- cbind(a, b)
My original Large list has 171 elements, meaning I have 1:171 (3.3 GB) to go inside those square brackets and would ideally end up with 171 desired dataframes (which I would then bind all together).
I haven't needed to write a loop in 10 years, but I don't see a tidyverse way to deal with this. I also no longer know how to write loops. There are definitely some elements in there that are junk and will fail.

You haven't provided any sort of minimal example of the data.
I've condensed it to mean something like this
base_data <- data.frame(group = c("a", "b", "c"), var1 = c(3, 1, 2),
var2 = c( 2, 4, 8))
base_data2 = matrix(
c(1, 2, 3, 4, 5, 6, 7, 8, 9),
nrow = 3,
ncol = 3,
byrow = TRUE
)
rownames(base_data2) = c("d", "e", "f")
methods::setClass(
"weird_object",
slots = c(data = "data.frame", coords = "matrix"),
prototype = list(data = base_data, coords = base_data2)
)
df <- list(
list(
result = new("weird_object")
),list(
result = new("weird_object")
),list(
result = new("weird_object")
),list(
result = new("weird_object")
)
)
And if I had such a list with these objects, then I could do
df %>%
map(. %>% {
list(data = .$result#data,
cooords = .$result#coords)
}) %>%
enframe() %>%
unnest_wider(value)
But the selecting / hoisting function might fail, thus
one can wrap it in a purrr::possibly, and
choose a reasonable default:
df %>%
map(possibly(. %>% {
list(data = .$result#data,
cooords = .$result#coords)
},
otherwise = list(data = NA, coords = NA))) %>%
enframe() %>%
unnest_wider(value)
Hopefully, this could be a step forward.
Next step is probably something resembling this:
df %>%
map(. %>% {
list(data = .$result#data,
coords = .$result#coords)
}) %>%
enframe() %>%
unnest_wider(value) %>%
mutate(coords = coords %>% map(. %>% as_tibble(rownames = "rowid"))) %>%
unnest(cols = c(data, coords)) %>%
#' rotating the thing now
pivot_longer(cols = c(group, rowid),
names_to = "var_name",
values_to = "var") %>%
select(-var_name) %>%
pivot_longer(cols = c(var1, var2, V1, V2, V3),
names_to = "var_name") %>%
pivot_wider(names_from = var, values_from = value) %>%
identity()

If I understand your data structure, which I probably don't, you could do:
library(tidyverse)
# Create dummy data
df <- mtcars
df$mpg <- list(result = I(list('test')))
df$mpg$result <- list("#data" = I(list('your data')))
df <- df %>% select(mpg, cyl)
df1 <- df
df2 <- df
# Pull data you're interested in.
# The index is 1 here, instead of 2, because it's fake data and not your data.
# Assuming the # is not unique, and is just parsed from JSON or some other format.
dont_at_me <- function(x){
a <- x[[1]][["result"]][["#data"]]
a
}
# Get a list of all of your data.frames
all_dfs <- Filter(function(x) is(x, "data.frame"), mget(ls()))
# Vectorize
purrr::map(all_dfs, ~dont_at_me(.))

Group dataframe row and column wise based on other dataframe?

I have a dataframe that I would like to group in both directions, first rowise and columnwise after. The first part worked well, but I am stuck with the second one. I would appreciate any help or advice for a solution that does both steps at the same time.
This is the dataframe:
df1 <- data.frame(
ID = c(rep(1,5),rep(2,5)),
ID2 = rep(c("A","B","C","D","E"),2),
A = rnorm(10,20,1),
B = rnorm(10,50,1),
C = rnorm(10,10,1),
D = rnorm(10,15,1),
E = rnorm(10,5,1)
)
This is the second dataframe, which holds the "recipe" for grouping:
df2 <- data.frame (
Group_1 = c("B","C"),
Group_2 = c("D","A"),
Group_3 = ("E"), stringsAsFactors = FALSE)
Rowise grouping:
df1_grouped<-bind_cols(df1[1:2], map_df(df2, ~rowSums(df1[unique(.x)])))
Now i would like to apply the same grouping to the ID2 column and sum the values in the other columns. My idea was to mutate a another column (e.g. "group", which contains the name of the final group of ID2. After this i can use group_by() and summarise() to calculate the sum for each. However, I can't figure out an automated way to do it
bind_cols(df1_grouped,
#add group label
data.frame(
group = rep(c("Group_2","Group_1","Group_1","Group_2","Group_3"),2))) %>%
#remove temporary label column and make ID a character column
mutate(ID2=group,
ID=as.character(ID))%>%
select(-group) %>%
#summarise
group_by(ID,ID2)%>%
summarise_if(is.numeric, sum, na.rm = TRUE)
This is the final table I need, but I had to manually assign the groups, which is impossible for big datasets

I will offer such a solution
library(tidyverse)
set.seed(1)
df1 <- data.frame(
ID = c(rep(1,5),rep(2,5)),
ID2 = rep(c("A","B","C","D","E"),2),
A = rnorm(10,20,1),
B = rnorm(10,50,1),
C = rnorm(10,10,1),
D = rnorm(10,15,1),
E = rnorm(10,5,1)
)
df2 <- data.frame (
Group_1 = c("B","C"),
Group_2 = c("D","A"),
Group_3 = ("E"), stringsAsFactors = FALSE)
df2 <- df2 %>% pivot_longer(everything())
df1 %>%
pivot_longer(-c(ID, ID2)) %>%
mutate(gr_r = df2$name[match(ID2, table = df2$value)],
gr_c = df2$name[match(name, table = df2$value)]) %>%
arrange(ID, gr_r, gr_c) %>%
pivot_wider(c(ID, gr_r), names_from = gr_c, values_from = value, values_fn = list(value = sum))

Combine rows with duplicate identifiers while adding additional columns

Here's a simple example of what I'm looking for:
Before:
data.frame(
Name = c("pusheen", "pusheen", "puppy"),
Species = c("feline", "feline", "doggie"),
Activity = c("snacking", "napping", "playing"),
Start = c(1, 2, 3),
End = c(11, 12, 13)
)
After:
data.frame(
Name = c("pusheen", "puppy"),
Species = c("feline", "doggie"),
Activity1 = c("snacking", "playing"),
Start1 = c(1, 3),
End1 = c(11, 13),
Activity2 = c("napping", NA),
Start2 = c(2, NA),
End2 = c(12, NA)
)
How do I do this in R or Excel? Thanks!

This can be done using pivot_wider from the tidyr package.
library(tidyr)
library(dplyr)
library(magrittr)
df <- df %>%
group_by(Name) %>%
mutate(num = row_number()) %>% # Create a counter by group
ungroup() %>%
pivot_wider(
id_cols = c("Name", "Species"),
names_from = num,
values_from = c("Activity", "Start", "End"),
names_sep = "")
If you want the result ordered as in your sample output, we can add an additional select statement. I used str_sub from the stringr package to pull out the last character from each column name, and then sorted the names from there. This method of ordering columns should generalise to any number of activities.
library(stringr)
df %>%
select(Name, Species, names(df)[order(str_sub(names(df), -1))])

Find max of rows from specific columns and extract column name and corresponding row value from another column

Here is a data structure that I have:
structure(list(UDD_beta = c(1.17136554204268, 0.939587997289016
), UDD_pval = c(0, 0), UDD_R.sq = c(0.749044972637797, 0.516943886705951
), SSX_beta = c(1.05356804780772, 0.927948300464624), SSX_pval = c(0,
0), SSX_R.sq = c(0.60226298037862, 0.629111666509209), SPP_beta = c(0.675765151939885,
0.516425218613404), SPP_pval = c(0, 0), SPP_R.sq = c(0.479849538274406,
0.378266618442121), EEE_beta = c(0.690521022226874, 0.639380962824289
), EEE_pval = c(0, 0), EEE_R.sq = c(0.585610742768951, 0.676073352909597
)), .Names = c("UDD_beta", "UDD_pval", "UDD_R.sq", "SSX_beta",
"SSX_pval", "SSX_R.sq", "SPP_beta", "SPP_pval", "SPP_R.sq",
"EEE_beta", "EEE_pval", "EEE_R.sq"), row.names = c("DDK", "DDL"
), class = "data.frame")
I want to take R.sq columns and for each row find the max and the column name of the max value. Then take corresponding beta. Expected output:
Name Value
DDK UDD 1.17136554204268
DDL EEE 0.690521022226874
Sorry, the second expected value should be 0.639380962824289.

We could use max.col. Subset the columns of interest i.e. columns that have 'R.sq' using the grep, then get the column index of max value with max.col. Use that to get the column names and also the values that correspond to a particular row (row/column indexing)
i1 <- grep("R.sq", names(df1))
i2 <- max.col(df1[i1], "first")
i3 <- grep("beta", names(df1))
res <- data.frame(Names = sub("_.*", "", names(df1)[i1][i2]),
Value = df1[i3][cbind(1:nrow(df1), i2)])
row.names(res) <- row.names(df1)

sub_data <- data[grep("R.sq", colnames(data))]
colnames(sub_data) <- gsub("_R.sq", "", colnames(sub_data))
sub_data$Name <- NA
sub_data$Value <- NA
for (i in 1:nrow(sub_data)){
sub_data$Name[i] <- names(sub_data[i,])[which.max(apply(sub_data[i,], 2, max))]
sub_data$Value[i] <- max(data[grep(paste0(sub_data$Name[i], "_beta"), colnames(data))], na.rm=T)
}
sub_data[c("Name", "Value")]
# Name Value
#DDK UDD 1.171366
#DDL EEE 0.690521

You can use a tidyverse approach via gathering your df to long and filtering both R.sq vars and max value, i.e.
library(tidyverse)
df %>%
rownames_to_column('ID') %>%
gather(var, val, -ID) %>%
filter(grepl('R.sq|beta', var)) %>%
group_by(ID) %>%
mutate(max1=as.integer(val == max(val[grepl('R.sq', var)]))) %>%
group_by(ID, grp = sub('_.*', '', var)) %>%
filter(!all(max1 == 0) & grepl('beta', var)) %>%
ungroup() %>% select(-c(max1, grp))
which gives,
# A tibble: 2 x 3
ID var val
<chr> <chr> <dbl>
1 DDK UDD_beta 1.171366
2 DDL EEE_beta 0.639381

# Need ID for all possible betas and Rsq
ID <- gsub("_R.sq", "", grep("_R.sq$", names(INPUT), value = TRUE))
dummy <- function(x) {
# Find out which Rsq is largest
i <- ID[which.max(x[paste0(ID, "_R.sq")])]
# Extract beta for largest Rsq
data.frame(Name = i, Value = x[paste0(i, "_beta")])
}
do.call("rbind", apply(INPUT, 1, dummy))

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Spread a dataframe and relabel column names - r

Related

New column with random boolean values while controlling the ratio of TRUE/FALSE per category

Vectorization to extract and bind very nested data

Group dataframe row and column wise based on other dataframe?

Combine rows with duplicate identifiers while adding additional columns

Find max of rows from specific columns and extract column name and corresponding row value from another column

Categories

Resources