transpose/reshape with customised names - r

To create the sample needed:
require(pacman)
p_load(data.table)
DT_start <- data.table(ID = c(1,1,1,2,2,2), valueA = c("a1","a2","a3","b1","b2","b3"), valueB = c("A1","A2","A3","B1","B2","B3"))
DT_end <- data.table(ID = c(1,2)
, T01_valueA = c("a1","b1")
, T02_valueA = c("a2","b2")
, T03_valueA = c("a3","b3")
, T01_valueB = c("A1","B1")
, T02_valueB = c("A2","B2")
, T03_valueB = c("A3","B3"))
setcolorder(DT_end, c("ID","T01_valueA","T01_valueB","T02_valueA","T02_valueB","T03_valueA","T03_valueB"))
I have:
> DT_start
ID valueA valueB
1: 1 a1 A1
2: 1 a2 A2
3: 1 a3 A3
4: 2 b1 B1
5: 2 b2 B2
6: 2 b3 B3
I need:
> DT_end
ID T01_valueA T01_valueB T02_valueA T02_valueB T03_valueA T03_valueB
1: 1 a1 A1 a2 A2 a3 A3
2: 2 b1 B1 b2 B2 b3 B3
How to achieve it? basically transpose DT_start to DT_end with customized names: T01, T02, T03...

Using the input DT in the Note at the end we create a sequence within ID column s, melt it to long form and then dcast it back to the desired wide form. (The dcast formula could alternately be written as ID ~ s + variable.)
library(data.table)
DT[, s := sprintf("T%02d", seq_along(.I)), ID]
m <- melt(DT, id.vars = c("ID", "s"))
dcast(m, ID ~ ...)
giving:
ID T01_valueA T01_valueB T02_valueA T02_valueB T03_valueA T03_valueB
1: 1 a1 A1 a2 A2 a3 A3
2: 2 b1 B1 b2 B2 b3 B3
Note:
Input used:
library(data.table)
DF <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L), valueA = c("a1",
"a2", "a3", "b1", "b2", "b3"), valueB = c("A1", "A2", "A3", "B1",
"B2", "B3")), class = "data.frame",
row.names = c(NA, -6L))
DT <- as.data.table(DF)

Related

How to use filter across and str_detect together to filter conditional on mutlitple columns

I have this dataframe:
df <- structure(list(col1 = c("Z2", "A2", "B2", "C2", "A2", "E2", "F2",
"G2"), col2 = c("Z2", "Z2", "A2", "B2", "C2", "D2", "A2", "F2"
), col3 = c("A2", "B2", "C2", "D2", "E2", "F2", "G2", "Z2")), class = "data.frame", row.names = c(NA, -8L))
> df
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 C2 B2 D2
5 A2 C2 E2
6 E2 D2 F2
7 F2 A2 G2
8 G2 F2 Z2
I would like to use explicitly filter, across and str_detect in a tidyverse setting to filter all rows that start with an A over col1:col3.
Expected result:
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 A2 C2 E2
5 F2 A2 G2
I have tried:
library(dplyr)
library(stringr)
df %>%
filter(across(c(col1, col2, col3), ~str_detect(., "^A")))
This gives:
[1] col1 col2 col3
<0 Zeilen> (oder row.names mit Länge 0)
I want to learn why this code is not working using filter, across and str_detect!
We can use if_any as across will look for & condition i.e. all columns should meet the condition for a particular row to get filtered
library(dplyr)
library(stringr)
df %>%
filter(if_any(everything(), ~str_detect(., "^A")))
-output
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 A2 C2 E2
5 F2 A2 G2
According to ?across
if_any() and if_all() apply the same predicate function to a selection of columns and combine the results into a single logical vector: if_any() is TRUE when the predicate is TRUE for any of the selected columns, if_all() is TRUE when the predicate is TRUE for all selected columns.
across() supersedes the family of "scoped variants" like summarise_at(), summarise_if(), and summarise_all().
The if_any/if_all are not part of the scoped variants

how to create new variables from one variable using two rules

I would appreciate any help to create new variables from one variable.
Specifically, I need help to simultaneously create one row per each ID and various columns of E, where each of the new columns of E, (that is, E1, E2, E3) contains the values of E for each row of ID. I tried doing this which melt followed by spread but I am getting the error:
Error: Duplicate identifiers for rows (4, 7, 9), (1, 3, 6), (2, 5, 8)
Additionally, I tried the solutions discussed here and here but these did not work for my case because I need to be able to create row identifiers for rows (4, 1, 2), (7, 3, 5), and (9, 6, 8). That is, E for rows (4, 1, 2) should be named E1, E for rows (7, 3, 5) should be named E2, E for rows (9, 6, 8) should be named E3, and so on.
#data
dT<-structure(list(A = c("a1", "a2", "a1", "a1", "a2", "a1", "a1",
"a2", "a1"), B = c("b2", "b2", "b2", "b1", "b2", "b2", "b1",
"b2", "b1"), ID = c("3", "4", "3", "1", "4", "3", "1", "4", "1"
), E = c(0.621142094943352, 0.742109450696123, 0.39439152996948,
0.40694392882818, 0.779607277916503, 0.550579323666347, 0.352622183880119,
0.690660491345867, 0.23378944873769)), class = c("data.table",
"data.frame"), row.names = c(NA, -9L))
#my attempt
A B ID E
1: a1 b2 3 0.6211421
2: a2 b2 4 0.7421095
3: a1 b2 3 0.3943915
4: a1 b1 1 0.4069439
5: a2 b2 4 0.7796073
6: a1 b2 3 0.5505793
7: a1 b1 1 0.3526222
8: a2 b2 4 0.6906605
9: a1 b1 1 0.2337894
aTempDF <- melt(dT, id.vars = c("A", "B", "ID")) )
A B ID variable value
1: a1 b2 3 E 0.6211421
2: a2 b2 4 E 0.7421095
3: a1 b2 3 E 0.3943915
4: a1 b1 1 E 0.4069439
5: a2 b2 4 E 0.7796073
6: a1 b2 3 E 0.5505793
7: a1 b1 1 E 0.3526222
8: a2 b2 4 E 0.6906605
9: a1 b1 1 E 0.2337894
aTempDF%>%spread(variable, value)
Error: Duplicate identifiers for rows (4, 7, 9), (1, 3, 6), (2, 5, 8)
#expected output
A B ID E1 E2 E3
1: a1 b2 3 0.6211421 0.3943915 0.5505793
2: a2 b2 4 0.7421095 0.7796073 0.6906605
3: a1 b1 1 0.4069439 0.3526222 0.2337894
Thanks in advance for any help.
You can use dcast from data.table
library(data.table)
dcast(dT, A + B + ID ~ paste0("E", rowid(ID)))
# A B ID E1 E2 E3
#1 a1 b1 1 0.4069439 0.3526222 0.2337894
#2 a1 b2 3 0.6211421 0.3943915 0.5505793
#3 a2 b2 4 0.7421095 0.7796073 0.6906605
You need to create the correct 'time variable' first which is what rowid(ID) does.
For those looking for a tidyverse solution:
library(tidyverse)
dT <- structure(
list(
A = c("a1", "a2", "a1", "a1", "a2", "a1", "a1", "a2", "a1"),
B = c("b2", "b2", "b2", "b1", "b2", "b2", "b1", "b2", "b1"),
ID = c("3", "4", "3", "1", "4", "3", "1", "4", "1"),
E = c(0.621142094943352, 0.742109450696123, 0.39439152996948, 0.40694392882818,
0.550579323666347, 0.352622183880119, 0.690660491345867, 0.23378944873769,
0.779607277916503)),
class = c("data.table",
"data.frame"),
row.names = c(NA, -9L))
dT %>%
as_tibble() %>% # since dataset is a data.table object
group_by(A, B, ID) %>%
# Just so columns are "E1", "E2", etc.
mutate(rn = glue::glue("E{row_number()}")) %>%
ungroup() %>%
spread(rn, E) %>%
# not necessary, just making output in the same order as your expected output
arrange(desc(B))
# A tibble: 3 x 6
# A B ID E1 E2 E3
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 a1 b2 3 0.621 0.394 0.551
#2 a2 b2 4 0.742 0.780 0.691
#3 a1 b1 1 0.407 0.353 0.234
As mentioned in the accepted answer, you need a "key" variable to spread on first. This is created using row_number() and glue where glue just gives you the proper E1, E2, etc. variable names.
The group_by piece just makes sure that the row numbers are with respect to A, B and ID.
EDIT for tidyr >= 1.0.0
The (not-so) new pivot_ functions supercede gather and spread and eliminate the need to glue the new variable names together in a mutate.
dT %>%
as_tibble() %>% # since dataset is a data.table object
group_by(A, B, ID) %>%
# no longer need to glue (or paste) the names together but still need a row number
mutate(rn = row_number()) %>%
ungroup() %>%
pivot_wider(names_from = rn, values_from = E, names_glue = "E{.name}") %>% # names_glue argument allows for easy transforming of the new variable names
# not necessary, just making output in the same order as your expected output
arrange(desc(B))
# A tibble: 3 x 6
# A B ID E1 E2 E3
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 a1 b2 3 0.621 0.394 0.551
#2 a2 b2 4 0.742 0.780 0.691
#3 a1 b1 1 0.407 0.353 0.234

How to apply a function to dataframe using list

I have a df
id a1 a2
1 x1 y1
2 x2 y2
and another dataframe df2
id name1 name2
1 a1 b1
1 a2 b2
2 a3 b3
3 a4 b4
3 a5 b5
df2 could contain multiple records of unique id's from df1.
I need to join the dataframes in such a way that for each row of df1, i should have one column from first record of df2 and if it exists, second column from second record.
To explain, the output should be like :
id a1 a2 n1 n2
1 x1 y1 a1 a2
2 x2 y2 a3 NA
For doing this I have split df2 on id using split
s <- split(df2, df2$id)
but i'm unsure how to use sapply over that. Any pointers for this
If we are not taking the 'name2' column
library(dplyr)
df2 %>%
filter(id %in% df$id) %>%
select(-name2) %>%
group_by(id) %>%
mutate(rn = paste0("n", row_number())) %>%
spread(rn, name1) %>%
left_join(df, .)
# id a1 a2 n1 n2
#1 1 x1 y1 a1 a2
#2 2 x2 y2 a3 <NA>
data
df <- structure(list(id = 1:2, a1 = c("x1", "x2"), a2 = c("y1", "y2"
)), .Names = c("id", "a1", "a2"), class = "data.frame", row.names = c(NA,
-2L))
df2 <- structure(list(id = c(1L, 1L, 2L, 3L, 3L), name1 = c("a1", "a2",
"a3", "a4", "a5"), name2 = c("b1", "b2", "b3", "b4", "b5")), .Names = c("id",
"name1", "name2"), class = "data.frame", row.names = c(NA, -5L))

Joining two dataframes by concatenating columns

I have two dataframes with the same structure - both have two ID columns and 25 string data columns. I want to join the two and concatenate the strings in the data columns when the IDs match. So, for example:
df_1:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A
df_2:
id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B
Combined, this should give
df_combined:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A, B A ... <NA>
a1 b2 A B ... A, B
a1 b3 B <NA> ... B
a2 b1 <NA> <NA> ... A
When I try to use join or merge, it repeats everything except the ID columns (so I end up with 50 data columns). Do I need to use something else?
Thanks!
You can do this if you don't have any empty string :
library(dplyr)
bind_rows(df_1,df_2) %>%
group_by(id_1,id_2) %>%
summarize_all(~ paste(na.omit(.x),collapse=", ")) %>%
`[<-`(.=="",value=NA)
with magrittr you can avoid the not so pretty '[<-' and replace it by inset
library(magrittr)
bind_rows(df_1,df_2) %>%
group_by(id_1,id_2) %>%
summarize_all(~ paste(na.omit(.x),collapse=", ")) %>%
inset(.=="",value=NA)
There is an alternative solution using melt() and dcast() to reshape the data:
library(data.table)
rbind(setDT(df_1), setDT(df_2))[
, melt(.SD, measure.var = patterns("col"), na.rm = TRUE)][
, dcast(.SD, id_1 + id_2 ~ variable, toString, fill = NA)]
id_1 id_2 col_1 col2 col_25
1: a1 b1 A, B A NA
2: a1 b2 A B A, B
3: a1 b3 B NA B
4: a2 b1 NA NA A
Data
df_1 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A",
drop = 5L, na.strings = "<NA>"
)
df_2 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B",
drop = 5L, na.strings = "<NA>"
)
To elaborate to the idea commented by #zx8754, and using dplyr package,
library(dplyr)
df1 %>%
bind_rows(df2) %>%
mutate_at(vars(-contains('id')), funs(replace(., is.na(.), ''))) %>%
group_by(id_1, id_2) %>%
summarise_all(funs(trimws(paste(., collapse = ' ')))) %>%
mutate_all(funs(replace(., . == '', NA)))
which gives,
# A tibble: 4 x 5
# Groups: id_1 [2]
id_1 id_2 col_1 col2 col_25
<chr> <chr> <chr> <chr> <chr>
1 a1 b1 A B A <NA>
2 a1 b2 A B A B
3 a1 b3 B <NA> B
4 a2 b1 <NA> <NA> A
NOTE:
Above script assumes that your NAs are actual NA (not characters)
Your variables are as.character
DATA
dput(df1)
structure(list(id_1 = c("a1", "a1", "a2"), id_2 = c("b1", "b2",
"b1"), col_1 = c("A", "A", NA), col2 = c("A", NA, NA), col_25 = c(NA,
"A", "A")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
> dput(df2)
structure(list(id_1 = c("a1", "a1", "a1"), id_2 = c("b1", "b2",
"b3"), col_1 = c("B", NA, "B"), col2 = c(NA, "B", NA), col_25 = c(NA,
"B", "B")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")

How to sum up the nth column with the n+1th column in a dataframe

How can i sum up the n th column with the n-1 th column in a dataframe for a subset of columns?
For example i have a dataframe as follows:
ID C1 C2 C3
1 2000-12-24 3d 2d
2 2000-12-24 2d 1d
i want R to do the following:
ID C1 C2 C3
1 2000-12-24 2000-12-24+3d=2000-12-27 2000-12-27+2d=2000-12-29
2 2000-12-24 2000-12-24+2d=2000-12-26 2000-12-26+1d=2000-12-27
so that the final dataframe looks like this:
ID C1 C2 C3 ...
1 2000-12-24 2000-12-27 2000-12-29
2 2000-12-24 2000-12-26 2000-12-27
UPDATE:
The data has been generated accordingly:
library(plyr)
library(lubridate)
library(reshape2)
Heterotransaction <- rgamma(2,shape=3 , scale=1)
ID <- list(1:2)
Elog <- data.frame(ID,Heterotransaction)
Elog$fist_transaction <- "2000-12-24"
Elog$fist_transaction <- as.Date(Elog$fist_transaction, "%Y-%m-%d")
Heterotransaction <- rgamma(2,shape=3 , scale=1)
f.transaction <- function(x){
y<- (rexp(2,x))
duration(y, units = "years")
}
tbtrans<-ldply(Heterotransaction, f.transaction)
purchases<-data.frame(ID,tbtrans)
Elognew<- merge.data.frame(Elog, purchases)
You could try
df1[3:ncol(df1)] <- lapply(3:ncol(df1), function(i) rowSums(df1[2:i]))
df1
# ID C1 C2 C3
#1 1 2 5 7
#2 2 4 7 8
or
df1[-1] <- t(apply(df1[-1], 1, cumsum))
Or another option would be to use Reduce
library(data.table)
setDT(df1)[,2:ncol(df1) := Reduce(`+`, .SD, accumulate=TRUE),
.SDcols=2:ncol(df1)][]
# ID C1 C2 C3
#1: 1 2 5 7
#2: 2 4 7 8
Update
Based on the new dataset, one option would be to modify the first solution
df2[3:ncol(df2)] <- do.call(rbind, lapply(3:ncol(df2), function(i)
as.Date(df2[,2]+cumsum(as.numeric(sub('[^0-9]+', '', df2[,i]))))))
df2[3:ncol(df2)] <- lapply(df2[3:ncol(df2)], as.Date, origin='1970-01-01')
df2
# ID C1 C2 C3
#1 1 2000-12-24 2000-12-27 2000-12-29
#2 2 2000-12-24 2000-12-26 2000-12-27
data
df1 <- structure(list(ID = 1:2, C1 = c(2L, 4L), C2 = c(3L, 3L),
C3 = c(2L, 1L)), .Names = c("ID", "C1", "C2", "C3"),
class = "data.frame", row.names = c(NA, -2L))
df2 <- df2 <- structure(list(ID = 1:2, C1 = structure(c(11315, 11315),
class = "Date"),
C2 = c("3d", "2d"), C3 = c("2d", "1d")), .Names = c("ID",
"C1", "C2", "C3"), row.names = c(NA, -2L), class = "data.frame")

Resources