Trying to pivot a table in R - r

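I am trying to pivot a table in R from long format (one row per ID/date pair), for example:
To wide format (one row per ID, with one column per date).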
Code to create the starting table:
# Long-format starting table: one row per (ID, date) pair.
df <- data.frame(
  ID = rep(c("A", "B", "C"), times = c(3, 2, 1)),
  Dates = c(
    "01/01/2021", "10/02/2021", "30/03/2021",
    "04/04/2021", "06/05/2021",
    "20/06/2021"
  )
)
Assume there is a max of three dates for the above example.

You are missing a column with the identifier "Date1", "Date2", "Date3". You can create it with mutate(), then use pivot_wider() from the tidyverse library.
library(tidyverse)

# Sample data: one row per (ID, date) pair.
dt <- data.frame(
  ID = c("A", "A", "A", "B", "B", "C"),
  Dates = c(
    "01/01/2021", "10/02/2021", "30/03/2021",
    "04/04/2021", "06/05/2021", "20/06/2021"
  )
)

# Number the dates within each ID ("Date1", "Date2", ...) so pivot_wider()
# has a column to take the new column names from. The grouping is only needed
# for the numbering, so it is dropped again before reshaping; IDs with fewer
# dates get NA in the trailing columns.
dt %>%
  group_by(ID) %>%
  mutate(col = paste0("Date", row_number())) %>%
  ungroup() %>%
  pivot_wider(id_cols = ID, names_from = col, values_from = Dates)

This is my approach :
# NOTE(review): the original snippet loaded no packages; dplyr (for the pipe
# and grouping verbs) and data.table (for dcast) are assumed here.
library(dplyr)
library(data.table)

my_df <- data.frame(
  ID = c("A", "A", "A", "B", "B", "C"),
  Dates = c(
    "01/01/2021", "10/02/2021", "30/03/2021",
    "04/04/2021", "06/05/2021", "20/06/2021"
  ),
  stringsAsFactors = FALSE
)

# Label each date with its position within the ID: "Date1", "Date2", ...
# The grouping is only needed for the numbering, so drop it afterwards.
my_df <- my_df %>%
  group_by(ID) %>%
  mutate(value = paste0("Date", seq_along(ID))) %>%
  ungroup()

# Spread the labels into columns. data.table's dcast() expects a data.table,
# so the tibble is converted first; the result is a data.table with one row
# per ID and NA where an ID has fewer dates.
my_df <- dcast(as.data.table(my_df), ID ~ value, value.var = "Dates")

Here's an approach similar to what you're requesting.
library("maditr")

# Cross-tabulate the long table: one row per date, one column per ID, and each
# cell holds the number of times that (date, ID) pair occurs.
df <- dcast(
  data = df,
  formula = Dates ~ ID,
  fun.aggregate = length
)

Another solution, using data.table
library(data.table)

df <- data.frame(
  ID = c("A", "A", "A", "B", "B", "C"),
  Dates = c(
    "01/01/2021", "10/02/2021", "30/03/2021",
    "04/04/2021", "06/05/2021", "20/06/2021"
  )
)
setDT(df)

# Collapse each ID's dates into a single delimited string (one row per ID).
df <- df[, .(dates = paste(Dates, collapse = ", ")), by = ID]

# Split the string back into one column per date. tstrsplit() pads IDs that
# have fewer dates with NA. The column names are derived from the number of
# pieces actually found, so this works for any maximum number of dates
# instead of exactly three.
date_parts <- tstrsplit(df$dates, ", ", fixed = TRUE)
date_cols <- paste0("Date", seq_along(date_parts))
df[, (date_cols) := date_parts]

# The helper column is no longer needed.
df[, dates := NULL]
df
# ID Date1 Date2 Date3
# 1: A 01/01/2021 10/02/2021 30/03/2021
# 2: B 04/04/2021 06/05/2021 <NA>
# 3: C 20/06/2021 <NA> <NA>

A base R option using reshape
# Add a running index `q` (1, 2, 3, ...) within each ID, then let reshape()
# spread the Dates column into one column per index value.
indexed <- df
indexed$q <- with(indexed, ave(seq_along(ID), ID, FUN = seq_along))
reshape(indexed, idvar = "ID", timevar = "q", direction = "wide")
gives
ID Dates.1 Dates.2 Dates.3
1 A 01/01/2021 10/02/2021 30/03/2021
4 B 04/04/2021 06/05/2021 <NA>
6 C 20/06/2021 <NA> <NA>

Related

Calculating row means and saving them in a new column in R (data table)

I have the following data table:
# data.table() is used below, so data.table has to be attached as well.
# It is loaded first so that dplyr's verbs keep precedence, as in the original.
library(data.table)
library(dplyr)

set.seed(123)

# One simulated value per day, starting on 2020-01-01.
dt <- data.table(
  date = seq(as.Date("2020-01-01"), by = "1 day", length.out = 365),
  Germany = rnorm(365, 2, 1),
  check.names = FALSE
)

# Reshape to one row per month and one column per day of the month.
# `month` must be derived before `date` is overwritten with the day number.
dt <- dt %>%
  mutate(
    month = format(date, "%b"),
    date = format(date, "%d")
  ) %>%
  tidyr::pivot_wider(names_from = date, values_from = Germany)
I would like to add two new columns (monthlyAverage, quarterlyAverage), one containing the monthly averages and the other column the quarterly averages.
For the monthly average you can take the row-wise mean; for the quarterly average you can create groups of 3 rows and take the mean of the monthly averages within each group.
library(dplyr)

dt %>%
  # Row-wise mean over every column except the first one
  # (assumed to be `month`, as produced by the reshaping step in the question).
  mutate(monthlyaverage = rowMeans(.[-1], na.rm = TRUE)) %>%
  # Consecutive blocks of three rows form one quarter.
  group_by(grp = ceiling(row_number() / 3)) %>%
  mutate(quaterlyaverage = mean(monthlyaverage)) %>%
  # Drop the grouping so later steps do not silently operate per quarter.
  ungroup() %>%
  select(month, grp, monthlyaverage, quaterlyaverage, everything())
If you want to do this using data.table :
library(data.table)

# Convert in place, then add both averages by reference.
setDT(dt)

# Row-wise mean across every column except the first one.
dt[, monthlyaverage := rowMeans(.SD, na.rm = TRUE), .SDcols = -1]

# Mean of the monthly averages within consecutive blocks of three rows.
dt[
  ,
  quaterlyaverage := mean(monthlyaverage),
  by = ceiling(seq_len(nrow(dt)) / 3)
]

Group dataframe row and column wise based on other dataframe?

I have a dataframe that I would like to group in both directions: first row-wise, then column-wise. The first part worked well, but I am stuck on the second one. I would appreciate any help or advice on a solution that does both steps at the same time.
This is the dataframe:
# Ten rows: two IDs, each with the five ID2 labels A-E, plus five columns of
# random measurements (drawn in the order A, B, C, D, E).
df1 <- data.frame(
  ID = rep(c(1, 2), each = 5),
  ID2 = rep(c("A", "B", "C", "D", "E"), times = 2),
  A = rnorm(n = 10, mean = 20, sd = 1),
  B = rnorm(n = 10, mean = 50, sd = 1),
  C = rnorm(n = 10, mean = 10, sd = 1),
  D = rnorm(n = 10, mean = 15, sd = 1),
  E = rnorm(n = 10, mean = 5, sd = 1)
)
This is the second dataframe, which holds the "recipe" for grouping:
# Grouping "recipe": each column names the members of one group.
df2 <- data.frame(
  Group_1 = c("B", "C"),
  Group_2 = c("D", "A"),
  # Single value; data.frame() recycles it to the length of the other columns.
  Group_3 = "E",
  stringsAsFactors = FALSE
)
Rowise grouping:
# For every group in df2, sum that group's member columns of df1 row by row
# (unique() removes the duplicates created by recycling), then put the sums
# next to the two identifier columns.
group_sums <- map_df(df2, function(members) rowSums(df1[unique(members)]))
df1_grouped <- bind_cols(df1[1:2], group_sums)
Now I would like to apply the same grouping to the ID2 column and sum the values in the other columns. My idea was to add another column (e.g. "group") that contains the name of the final group of each ID2 value. After that I can use group_by() and summarise() to calculate the sum for each group. However, I can't figure out an automated way to do it:
# Manually relabel ID2 with the group each value belongs to, then sum the
# numeric columns within every (ID, group) combination.
df1_grouped %>%
  mutate(
    # One label per ID2 value (A, B, C, D, E), repeated for both IDs.
    ID2 = rep(c("Group_2", "Group_1", "Group_1", "Group_2", "Group_3"), 2),
    # Make ID a character column so it is not treated as a numeric measurement.
    ID = as.character(ID)
  ) %>%
  group_by(ID, ID2) %>%
  summarise(
    across(where(is.numeric), ~ sum(.x, na.rm = TRUE)),
    # Same grouping as summarise()'s default (still grouped by ID), stated
    # explicitly so no informational message is printed.
    .groups = "drop_last"
  )
This is the final table I need, but I had to manually assign the groups, which is impossible for big datasets
Here is one possible solution:
library(tidyverse)

set.seed(1)

# Ten rows: two IDs, each with the ID2 labels A-E, plus five measurement columns.
df1 <- data.frame(
  ID = c(rep(1, 5), rep(2, 5)),
  ID2 = rep(c("A", "B", "C", "D", "E"), 2),
  A = rnorm(10, 20, 1),
  B = rnorm(10, 50, 1),
  C = rnorm(10, 10, 1),
  D = rnorm(10, 15, 1),
  E = rnorm(10, 5, 1)
)

# Grouping recipe: each column lists the members of one group
# ("E" is recycled to the length of the other columns).
df2 <- data.frame(
  Group_1 = c("B", "C"),
  Group_2 = c("D", "A"),
  Group_3 = ("E"),
  stringsAsFactors = FALSE
)

# Turn the recipe into a lookup table: `value` = member, `name` = its group.
df2 <- df2 %>% pivot_longer(everything())

df1 %>%
  # One row per (ID, ID2, original column) cell.
  pivot_longer(-c(ID, ID2)) %>%
  # Look up the group of the row label (gr_r) and of the column label (gr_c);
  # match() returns the first hit, so the recycled duplicate "E" is harmless.
  mutate(
    gr_r = df2$name[match(ID2, table = df2$value)],
    gr_c = df2$name[match(name, table = df2$value)]
  ) %>%
  arrange(ID, gr_r, gr_c) %>%
  # Sum all cells that fall into the same (ID, row group, column group).
  # `id_cols` is named explicitly: passing it by position is deprecated.
  pivot_wider(
    id_cols = c(ID, gr_r),
    names_from = gr_c,
    values_from = value,
    values_fn = list(value = sum)
  )

removing groups with a certain NA number

Sorry to bother with a relatively simple question perhaps.
I have this type of dataframe:
A long list of names in the column "NAME" c(a, b, c, d, e ...) , two potential classes in the column "SURNAME" c(A, B) and a third column containing values.
I want to remove all NAMES for which at least in one of the SURNAME classes I have more than 2 "NA" in the VALUE column.
I wanted to post an example dataset but I am struggling to format it properly
I was trying to use
# Attempt from the question, kept exactly as posted.
# NOTE(review): a second group_by() replaces the existing grouping by default,
# so after these two calls the data is grouped by SURNAME only, not by
# NAME and SURNAME together.
df <- df %>%
group_by(NAME) %>%
group_by(SURNAME) %>%
# Drops the individual rows whose VALUE is NA ...
filter(!is.na(VALUE)) %>%
# ... then keeps groups that still have at least 3 (non-NA) rows. This counts
# the remaining values rather than the number of NAs in the group.
filter(length(VALUE)>=3)
it does not throw an error but I have the impression that something is wrong. Any suggestion? Many thanks
Let's create a dataset to work with:
# Reproducible example data: 1000 rows with a random lower-case name, a
# surname class ("A" or "B") and a value that is either 10, 20, ..., 100 or NA.
set.seed(1234)
df <- data.frame(
  name = sample(x = letters, size = 1e3, replace = TRUE),
  surname = sample(x = c("A", "B"), size = 1e3, replace = TRUE),
  value = sample(x = c(1:10 * 10, NA), size = 1e3, replace = TRUE),
  stringsAsFactors = FALSE
)
Here's how to do it with Base R:
# Count NAs per name-surname combination. na.action = NULL is important:
# without it the formula interface drops the NA rows before FUN ever sees them.
agg <- aggregate(
  value ~ name + surname,
  data = df,
  FUN = function(x) sum(is.na(x)),
  na.action = NULL
)

# Rename the aggregated column to say what it holds.
names(agg)[3] <- "number_of_na"

# Add the NA count back to the original data.
df <- merge(df, agg, by = c("name", "surname"))

# A name must be removed entirely as soon as ANY of its surname classes has
# more than 2 NAs, so collect the offending names first and drop all of
# their rows (not just the rows of the offending combination).
names_to_drop <- unique(agg$name[agg$number_of_na > 2])
result <- df[!df$name %in% names_to_drop, ]
Here's how to do it with data.table:
library(data.table)

dt <- as.data.table(df)

# Number of NAs within each name-surname combination, added by reference.
dt[, number_of_na := sum(is.na(value)), by = .(name, surname)]

# A name is dropped entirely when any of its surname classes has more than
# 2 NAs, so filter on the name rather than on the single combination.
names_to_drop <- dt[number_of_na > 2, unique(name)]
result <- dt[!name %in% names_to_drop]
Here's how to do it with dplyr/tidyverse:
library(dplyr) # or library(tidyverse)

result <- df %>%
  # Number of NAs within each name-surname combination.
  group_by(name, surname) %>%
  mutate(number_of_na = sum(is.na(value))) %>%
  # Keep a name only if every one of its surname classes has fewer than
  # 3 NAs; one offending class removes the whole name.
  group_by(name) %>%
  filter(all(number_of_na < 3)) %>%
  ungroup()
After grouping by 'NAME', 'SURNAME', create a column with the number of NA elements in that group and then filter out any 'NAME' that have an 'ind' greater than or equal to 3
df %>%
  # `ind` = number of NA values within each NAME/SURNAME combination.
  group_by(NAME, SURNAME) %>%
  mutate(ind = sum(is.na(VALUE))) %>%
  # Regroup by NAME and keep only names whose every combination stays below 3.
  group_by(NAME) %>%
  filter(all(ind < 3)) %>%
  select(-ind)
Or do an anti_join after doing the filtering by 'NAME', 'SURNAME' based on the condition
# Names that have at least one NAME/SURNAME combination with 3 or more NAs.
flagged_names <- df %>%
  group_by(NAME, SURNAME) %>%
  filter(sum(is.na(VALUE)) >= 3) %>%
  ungroup() %>%
  distinct(NAME)

# Keep only the rows of df whose NAME is not among the flagged ones.
anti_join(df, flagged_names)
data
# Example data: 5 names with 20 rows each, a random surname class (A-D) and a
# random value that is NA, 1, 2 or 3.
set.seed(24)
df <- data.frame(
  NAME = rep(letters[1:5], each = 20),
  SURNAME = sample(LETTERS[1:4], 5 * 20, replace = TRUE),
  VALUE = sample(c(NA, 1:3), 5 * 20, replace = TRUE),
  stringsAsFactors = FALSE
)

Factors in many columns to boolean convert

My question is a little like this but the problem with the structure of data is different:
Sample data:
# Sample input: up to two stocks per id; the missing entry is the literal
# string "NA", not a real NA.
df <- data.frame(
  id = c(1, 2, 3),
  stock_1 = c("Google", "Microsoft", "Yahoo"),
  stock_2 = c("Yahoo", "Google", "NA")
)
and I would like to convert to this:
# Desired result: one 0/1 indicator column per stock name.
df <- data.frame(
  id = c(1, 2, 3),
  Google = c(1, 1, 0),
  Microsoft = c(0, 1, 0),
  Yahoo = c(1, 0, 1)
)
I tried to use sapply() but from the answer to the linked question it is only for one column.
Here is a way to do it with data.table:
library(data.table)

setDT(df)

# Long form: one row per (id, stock column); rows holding the literal string
# "NA" are dropped. `id.vars` is spelled out instead of relying on partial
# argument matching of `id`.
long <- melt(df, id.vars = "id")[value != "NA"]

# Wide form: one column per stock name, counting its occurrences per id.
dcast(long, id ~ value, fun.aggregate = length, value.var = "value")
# id Google Microsoft Yahoo
# 1: 1 1 0 1
# 2: 2 1 1 0
# 3: 3 0 0 1
fill = 0 is unnecessary, and to tolerate duplicates, we can try:
# Same reshaping, but the aggregate is a 0/1 presence flag rather than a
# count, so a stock listed twice for one id still yields 1.
dcast(
  melt(df, id.vars = "id")[value != "NA"],
  id ~ value,
  fun.aggregate = function(x) as.numeric(length(x) > 0),
  value.var = "value"
)
2017-01-01
As mentioned by Uwe, we can remove the NAs from the molten data by setting na.rm = TRUE, provided the missing values are real NAs rather than the hard-coded string "NA". The commands then look like this:
# With real NA values (not the string "NA"), melt() can drop them itself.
# Variant 1: count occurrences per id and stock.
dcast(
  melt(df, id.vars = "id", na.rm = TRUE),
  id ~ value,
  fun.aggregate = length,
  value.var = "value"
)
# or
# Variant 2: 0/1 presence flag, tolerant of duplicated stocks within an id.
dcast(
  melt(df, id.vars = "id", na.rm = TRUE),
  id ~ value,
  fun.aggregate = function(x) as.numeric(length(x) > 0),
  value.var = "value"
)
We can also do this with tidyverse
library(tidyverse)

df %>%
  # Long form: one row per (id, stock column).
  pivot_longer(-id, names_to = "key", values_to = "val") %>%
  # Drop missing entries, whether stored as a real NA or, as in the sample
  # data, as the literal string "NA".
  filter(!is.na(val), val != "NA") %>%
  # A stock listed twice for the same id must still produce a single 1.
  distinct(id, val) %>%
  mutate(ind = 1) %>%
  # Wide form: one indicator column per stock name (sorted alphabetically),
  # with 0 where the id does not hold the stock.
  pivot_wider(
    names_from = val,
    values_from = ind,
    values_fill = list(ind = 0),
    names_sort = TRUE
  )
NOTE: It is better to use NA instead of "NA" as we can take care of NA with is.na or na.omit or complete.cases

Find max of rows from specific columns and extract column name and corresponding row value from another column

Here is a data structure that I have:
# dput() output of the example data frame: two rows ("DDK", "DDL") and, for
# each of the prefixes UDD, SSX, SPP and EEE, a `_beta`, `_pval` and `_R.sq`
# column. The expression is not assigned to a name here.
structure(list(UDD_beta = c(1.17136554204268, 0.939587997289016
), UDD_pval = c(0, 0), UDD_R.sq = c(0.749044972637797, 0.516943886705951
), SSX_beta = c(1.05356804780772, 0.927948300464624), SSX_pval = c(0,
0), SSX_R.sq = c(0.60226298037862, 0.629111666509209), SPP_beta = c(0.675765151939885,
0.516425218613404), SPP_pval = c(0, 0), SPP_R.sq = c(0.479849538274406,
0.378266618442121), EEE_beta = c(0.690521022226874, 0.639380962824289
), EEE_pval = c(0, 0), EEE_R.sq = c(0.585610742768951, 0.676073352909597
)), .Names = c("UDD_beta", "UDD_pval", "UDD_R.sq", "SSX_beta",
"SSX_pval", "SSX_R.sq", "SPP_beta", "SPP_pval", "SPP_R.sq",
"EEE_beta", "EEE_pval", "EEE_R.sq"), row.names = c("DDK", "DDL"
), class = "data.frame")
I want to take R.sq columns and for each row find the max and the column name of the max value. Then take corresponding beta. Expected output:
Name Value
DDK UDD 1.17136554204268
DDL EEE 0.690521022226874
Sorry, the second expected value should be 0.639380962824289.
We could use max.col. Subset the columns of interest i.e. columns that have 'R.sq' using the grep, then get the column index of max value with max.col. Use that to get the column names and also the values that correspond to a particular row (row/column indexing)
# Positions of the R.sq and beta columns. The patterns are anchored and the
# dot is escaped so that only real "<prefix>_R.sq" / "<prefix>_beta" names match.
i1 <- grep("_R\\.sq$", names(df1))
i3 <- grep("_beta$", names(df1))

# For every row, the index (within the R.sq columns) of the largest R.sq;
# ties resolve to the first column.
i2 <- max.col(as.matrix(df1[i1]), ties.method = "first")

# Pick the prefix of the winning R.sq column and, via (row, column) matrix
# indexing, the beta in the same position.
# NOTE: this relies on the beta and R.sq columns appearing in the same prefix
# order, as they do in the example data.
res <- data.frame(
  Names = sub("_.*", "", names(df1)[i1][i2]),
  Value = as.matrix(df1[i3])[cbind(seq_len(nrow(df1)), i2)]
)
row.names(res) <- row.names(df1)
# Keep only the R.sq columns and strip the suffix so the column names are
# the bare prefixes (UDD, SSX, ...).
sub_data <- data[grep("_R\\.sq$", colnames(data))]
colnames(sub_data) <- sub("_R\\.sq$", "", colnames(sub_data))
prefixes <- colnames(sub_data)

# Typed placeholders for the result columns.
sub_data$Name <- NA_character_
sub_data$Value <- NA_real_

for (i in seq_len(nrow(sub_data))) {
  # Prefix with the largest R.sq in this row. Only the R.sq columns are
  # inspected, not the Name/Value columns added above.
  best <- prefixes[which.max(unlist(sub_data[i, prefixes]))]
  sub_data$Name[i] <- best
  # Beta of that prefix for THIS row. Taking max() over the whole beta column
  # would return another row's value whenever that one is larger.
  sub_data$Value[i] <- data[i, paste0(best, "_beta")]
}
sub_data[c("Name", "Value")]
#     Name    Value
# DDK  UDD 1.171366
# DDL  EEE 0.639381
You can use a tidyverse approach via gathering your df to long and filtering both R.sq vars and max value, i.e.
library(tidyverse)

df %>%
  rownames_to_column("ID") %>%
  # Long form: one row per (ID, original column).
  pivot_longer(-ID, names_to = "var", values_to = "val") %>%
  filter(grepl("_R\\.sq$|_beta$", var)) %>%
  # Flag the R.sq entry that is the largest within each ID. Only R.sq rows
  # can be flagged, so a beta that happens to equal the maximum R.sq is
  # never mistaken for it.
  group_by(ID) %>%
  mutate(
    is_rsq = grepl("_R\\.sq$", var),
    max1 = is_rsq & val == max(val[is_rsq])
  ) %>%
  # Within each (ID, prefix) pair keep the beta row if that prefix holds the
  # winning R.sq.
  group_by(ID, grp = sub("_.*", "", var)) %>%
  filter(any(max1) & grepl("_beta$", var)) %>%
  ungroup() %>%
  select(-c(is_rsq, max1, grp))
which gives,
# A tibble: 2 x 3
ID var val
<chr> <chr> <dbl>
1 DDK UDD_beta 1.171366
2 DDL EEE_beta 0.639381
# Prefixes (e.g. "UDD") of every column named "<prefix>_R.sq"; each prefix is
# expected to have a matching "<prefix>_beta" column. The dot is escaped and
# the pattern anchored so only the literal suffix is removed.
ID <- sub("_R\\.sq$", "", grep("_R\\.sq$", names(INPUT), value = TRUE))
# For one row `x` (a named vector), return a one-row data frame holding the
# prefix with the largest "<prefix>_R.sq" entry and that prefix's
# "<prefix>_beta" value. Reads the prefixes from the variable `ID` defined
# outside the function.
dummy <- function(x) {
  # R.sq entries of this row, in the order of the prefixes in ID
  rsq <- x[paste0(ID, "_R.sq")]
  # Prefix whose R.sq is largest (first one on ties)
  best <- ID[which.max(rsq)]
  # Name of the beta entry that belongs to the winning prefix
  beta_col <- paste0(best, "_beta")
  data.frame(Name = best, Value = x[beta_col])
}
# Apply dummy() to every row of INPUT, then stack the one-row results into a
# single data frame.
per_row <- apply(INPUT, 1, dummy)
do.call("rbind", per_row)

Resources