I have a df
id a1 a2
1 x1 y1
2 x2 y2
and another dataframe df2
id name1 name2
1 a1 b1
1 a2 b2
2 a3 b3
3 a4 b4
3 a5 b5
df2 can contain multiple records for each id that appears in df.
I need to join the data frames so that each row of df gets one column taken from the first matching record of df2 and, if it exists, a second column taken from the second matching record.
To explain, the output should be like :
id a1 a2 n1 n2
1 x1 y1 a1 a2
2 x2 y2 a3 NA
For doing this I have split df2 on id using split
# Split df2 into a list of data frames, one element per distinct id
s <- split(df2, df2$id)
but I'm unsure how to use sapply over the result. Any pointers?
If we are not taking the 'name2' column
# Spread the name1 values of df2 into one column per within-id occurrence
# (n1, n2, ...) and attach them to df. Ids that are absent from df are
# dropped; ids with fewer records get NA in the unused columns.
library(dplyr)
library(tidyr)  # pivot_wider() comes from tidyr, not dplyr

df2 %>%
  filter(id %in% df$id) %>%                   # keep only ids present in df
  select(-name2) %>%                          # name2 is not part of the output
  group_by(id) %>%
  mutate(rn = paste0("n", row_number())) %>%  # n1, n2, ... within each id
  ungroup() %>%
  pivot_wider(names_from = rn, values_from = name1) %>%
  left_join(df, ., by = "id")                 # explicit key, no join message
# id a1 a2 n1 n2
#1 1 x1 y1 a1 a2
#2 2 x2 y2 a3 <NA>
data
# Example inputs: df has one row per id; df2 may hold several rows per id,
# including an id (3) that does not occur in df.
df <- data.frame(
  id = 1:2,
  a1 = c("x1", "x2"),
  a2 = c("y1", "y2"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  id = c(1L, 1L, 2L, 3L, 3L),
  name1 = paste0("a", 1:5),
  name2 = paste0("b", 1:5),
  stringsAsFactors = FALSE
)
Related
I have this dataframe:
# Sample data: three character columns, eight rows
df <- data.frame(
  col1 = c("Z2", "A2", "B2", "C2", "A2", "E2", "F2", "G2"),
  col2 = c("Z2", "Z2", "A2", "B2", "C2", "D2", "A2", "F2"),
  col3 = c("A2", "B2", "C2", "D2", "E2", "F2", "G2", "Z2"),
  stringsAsFactors = FALSE
)
> df
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 C2 B2 D2
5 A2 C2 E2
6 E2 D2 F2
7 F2 A2 G2
8 G2 F2 Z2
I would like to explicitly use filter, across and str_detect in a tidyverse setting to keep all rows in which any of col1:col3 contains a value that starts with an A.
Expected result:
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 A2 C2 E2
5 F2 A2 G2
I have tried:
library(dplyr)
library(stringr)
# Attempt from the question: on the sample data this returns zero rows
# (see the output printed below).
df %>%
filter(across(c(col1, col2, col3), ~str_detect(., "^A")))
This gives:
[1] col1 col2 col3
<0 Zeilen> (oder row.names mit Länge 0)
I want to learn why this code is not working using filter, across and str_detect!
We can use if_any here. filter() combines the columns returned by across() with &, i.e. every selected column has to meet the condition for a row to be kept, whereas if_any() keeps a row as soon as any one of the columns meets it.
library(dplyr)
library(stringr)
# Keep a row as soon as any of its columns holds a value beginning with "A"
filter(
  df,
  if_any(everything(), function(column) str_detect(column, "^A"))
)
-output
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 A2 C2 E2
5 F2 A2 G2
According to ?across
if_any() and if_all() apply the same predicate function to a selection of columns and combine the results into a single logical vector: if_any() is TRUE when the predicate is TRUE for any of the selected columns, if_all() is TRUE when the predicate is TRUE for all selected columns.
across() supersedes the family of "scoped variants" like summarise_at(), summarise_if(), and summarise_all().
The if_any/if_all are not part of the scoped variants
There is a data frame with multiple values in one column.
I want to change the rows and columns of this data frame.
like this..
data:
result:
What should I do?
Please try this
# Number the rows within each type, spread the types into columns using that
# index, then drop the helper index again.
wide_df <- df %>% group_by(type) %>%
mutate(row= row_number()) %>%
pivot_wider(names_from = type,values_from = value) %>%
select(-row)
Edit: I need to check your expected output for your case, e.g. when the two rows 1, D, D2 and 2, A, A1 are added to the input.
Old answer: You actually want to distribute every available value of columns A to D for each id. So I first compute, in a temporary table tmp, the number of rows that have to be generated for each id. I then collect the values by pivoting the data wider and finally replicate each list the required number of times.
Follow it like this..
# Load libraries
library(dplyr)
library(tidyr)
# Calculate the number of rows to generate per id: count the values of each
# (id, type) pair, then multiply those counts within each id.
tmp <- df %>% group_by(id, type) %>%
mutate(tmp = n()) %>%
summarise(tmp = max(tmp)) %>%
group_by(id) %>%
summarise(tmp = prod(tmp))
# Store the per-id row counts in n (one element per id in tmp).
# NOTE(review): values_fn below reads n as a global; with more than one id
# n is a vector, so n/length(...) is no longer a single repeat count - confirm
# this is only meant for single-id input such as the sample data.
n <- tmp$tmp
# Final code: collect each type's values into a list cell, repeat them until
# the cell has n entries, then unnest all type columns side by side.
# NOTE(review): the values are paired by position after repetition; this
# appears to produce every combination only when the per-type counts share no
# common factor (2 and 3 in the sample) - verify for other inputs.
df %>% pivot_wider(names_from = type, values_from = value,
values_fn = function(x){
l <- list(x)
list(rep(l[[1]], n/length(l[[1]])))
}) %>%
unnest(-id)
# A tibble: 6 x 5
id A B C D
<int> <chr> <chr> <chr> <chr>
1 1 A1 B1 C1 D1
2 1 A1 B2 C2 D1
3 1 A1 B1 C3 D1
4 1 A1 B2 C1 D1
5 1 A1 B1 C2 D1
6 1 A1 B2 C3 D1
dput used
# Sample input: a single id with 1, 2, 3 and 1 values for types A, B, C, D
df <- data.frame(
  id = rep(1L, 7L),
  type = rep(c("A", "B", "C", "D"), times = c(1L, 2L, 3L, 1L)),
  value = c("A1", "B1", "B2", "C1", "C2", "C3", "D1"),
  stringsAsFactors = FALSE
)
I think you can do this with unstack and expand.grid in case order does not matter and id is not used:
# Group the values by type, then build every combination of one value per type
expand.grid(unstack(x, value ~ type))
# A B C D
#1 A1 B1 C1 D1
#2 A1 B2 C1 D1
#3 A1 B1 C2 D1
#4 A1 B2 C2 D1
#5 A1 B1 C3 D1
#6 A1 B2 C3 D1
Data:
# Sample input: one id, types A-D with 1, 2, 3 and 1 values respectively
x <- data.frame(
  id = 1,
  type = rep(c("A", "B", "C", "D"), times = c(1, 2, 3, 1)),
  value = c("A1", "B1", "B2", "C1", "C2", "C3", "D1")
)
Sadly, the posted answers didn't give the right results.
So I got the result my own way, but I don't know whether it is an efficient way.
library(dplyr)  # %>%
library(tidyr)  # pivot_wider(), separate_rows()

df <- data.frame(
  id = 1,
  type = c("A", "B", "B", "C", "C", "C", "D"),
  val = c("A1", "B1", "B2", "C1", "C2", "C3", "D1")
)
# Collapse the values of every (id, type) pair into one ", "-separated string
tmp <- aggregate(val ~ id + type, df, toString)
# One column per type, then split each column back into rows one after the
# other so that every combination of values is generated. The separator must
# be ", " (what toString() inserts); splitting on "," alone would leave a
# leading space in every value but the first.
result_df <- tmp %>%
  pivot_wider(names_from = type, values_from = val) %>%
  separate_rows(A, sep = ", ") %>%
  separate_rows(B, sep = ", ") %>%
  separate_rows(C, sep = ", ") %>%
  separate_rows(D, sep = ", ")
result:
Anyway, thank you all so much!
To create the sample needed:
# library() fails loudly when the package is missing, whereas require() only
# returns FALSE; loading data.table directly also removes the pacman dependency.
library(data.table)

# Long input: three records per ID
DT_start <- data.table(
  ID = c(1, 1, 1, 2, 2, 2),
  valueA = c("a1", "a2", "a3", "b1", "b2", "b3"),
  valueB = c("A1", "A2", "A3", "B1", "B2", "B3")
)
# Desired wide result: one row per ID, one Txx_ column pair per record
DT_end <- data.table(
  ID = c(1, 2),
  T01_valueA = c("a1", "b1"),
  T02_valueA = c("a2", "b2"),
  T03_valueA = c("a3", "b3"),
  T01_valueB = c("A1", "B1"),
  T02_valueB = c("A2", "B2"),
  T03_valueB = c("A3", "B3")
)
# Interleave the valueA/valueB columns record by record
setcolorder(
  DT_end,
  c("ID", "T01_valueA", "T01_valueB", "T02_valueA", "T02_valueB",
    "T03_valueA", "T03_valueB")
)
I have:
> DT_start
ID valueA valueB
1: 1 a1 A1
2: 1 a2 A2
3: 1 a3 A3
4: 2 b1 B1
5: 2 b2 B2
6: 2 b3 B3
I need:
> DT_end
ID T01_valueA T01_valueB T02_valueA T02_valueB T03_valueA T03_valueB
1: 1 a1 A1 a2 A2 a3 A3
2: 2 b1 B1 b2 B2 b3 B3
How to achieve it? basically transpose DT_start to DT_end with customized names: T01, T02, T03...
Using the input DT shown in the Note at the end, we create a within-ID sequence column s, melt the table to long form and then dcast it back to the desired wide form. (The dcast formula could alternatively be written as ID ~ s + variable.)
library(data.table)
# Label the records of each ID as T01, T02, ... (adds column s by reference)
DT[, s := sprintf("T%02d", seq_len(.N)), by = ID]
# Long form: one row per (ID, s, variable)
m <- melt(DT, id.vars = c("ID", "s"))
# Wide form: "..." stands for the remaining variables, here s and variable
dcast(m, ID ~ ...)
giving:
ID T01_valueA T01_valueB T02_valueA T02_valueB T03_valueA T03_valueB
1: 1 a1 A1 a2 A2 a3 A3
2: 2 b1 B1 b2 B2 b3 B3
Note:
Input used:
library(data.table)
# Input: three records per ID, kept as a data.frame and as a data.table
DF <- data.frame(
  ID = rep(1:2, each = 3L),
  valueA = c("a1", "a2", "a3", "b1", "b2", "b3"),
  valueB = c("A1", "A2", "A3", "B1", "B2", "B3"),
  stringsAsFactors = FALSE
)
DT <- as.data.table(DF)
I have two dataframes with the same structure - both have two ID columns and 25 string data columns. I want to join the two and concatenate the strings in the data columns when the IDs match. So, for example:
df_1:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A
df_2:
id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B
Combined, this should give
df_combined:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A, B A ... <NA>
a1 b2 A B ... A, B
a1 b3 B <NA> ... B
a2 b1 <NA> <NA> ... A
When I try to use join or merge, it repeats everything except the ID columns (so I end up with 50 data columns). Do I need to use something else?
Thanks!
You can do this if you don't have any empty string :
library(dplyr)
# Stack both tables, then per (id_1, id_2) join the non-NA values of every
# data column with ", ". Columns that had only NA collapse to "" and are
# turned back into NA in the last step (hence the "no empty string" caveat).
bind_rows(df_1, df_2) %>%
  group_by(id_1, id_2) %>%
  # across() replaces the superseded summarize_all(); drop_last keeps the
  # grouping that summarize_all() left behind
  summarise(
    across(everything(), ~ paste(na.omit(.x), collapse = ", ")),
    .groups = "drop_last"
  ) %>%
  `[<-`(. == "", value = NA)
with magrittr you can avoid the not so pretty '[<-' and replace it by inset
library(magrittr)
# Same pipeline as the `[<-` version, with magrittr's inset() doing the final
# ""-to-NA replacement.
bind_rows(df_1, df_2) %>%
  group_by(id_1, id_2) %>%
  # across() replaces the superseded summarize_all(); drop_last keeps the
  # grouping that summarize_all() left behind
  summarise(
    across(everything(), ~ paste(na.omit(.x), collapse = ", ")),
    .groups = "drop_last"
  ) %>%
  inset(. == "", value = NA)
There is an alternative solution using melt() and dcast() to reshape the data:
library(data.table)
# Stack both tables, melt the columns matching "col" to long form (dropping
# NA values), then cast back to wide, joining the values of each
# (id_1, id_2, variable) cell with toString and filling empty cells with NA.
# Note that setDT() converts df_1 and df_2 to data.tables by reference.
rbind(setDT(df_1), setDT(df_2))[
, melt(.SD, measure.var = patterns("col"), na.rm = TRUE)][
, dcast(.SD, id_1 + id_2 ~ variable, toString, fill = NA)]
id_1 id_2 col_1 col2 col_25
1: a1 b1 A, B A NA
2: a1 b2 A B A, B
3: a1 b3 B NA B
4: a2 b1 NA NA A
Data
# Read the sample tables from inline text. drop = 5L skips the fifth column
# (the "..." placeholder) and na.strings = "<NA>" turns the literal <NA>
# markers into missing values.
df_1 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A",
drop = 5L, na.strings = "<NA>"
)
df_2 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B",
drop = 5L, na.strings = "<NA>"
)
To elaborate on the idea suggested in the comments by @zx8754, using the dplyr package:
library(dplyr)
# Stack both tables, blank out NA in the data columns, paste the values of
# each (id_1, id_2) group together with a space, and turn cells that end up
# empty back into NA. across() with lambdas replaces the deprecated funs()
# and the superseded mutate_at()/summarise_all()/mutate_all().
df1 %>%
  bind_rows(df2) %>%
  mutate(across(-contains("id"), ~ replace(.x, is.na(.x), ""))) %>%
  group_by(id_1, id_2) %>%
  # drop_last keeps the id_1 grouping shown in the printed result
  summarise(
    across(everything(), ~ trimws(paste(.x, collapse = " "))),
    .groups = "drop_last"
  ) %>%
  mutate(across(everything(), ~ replace(.x, .x == "", NA)))
which gives,
# A tibble: 4 x 5
# Groups: id_1 [2]
id_1 id_2 col_1 col2 col_25
<chr> <chr> <chr> <chr> <chr>
1 a1 b1 A B A <NA>
2 a1 b2 A B A B
3 a1 b3 B <NA> B
4 a2 b1 <NA> <NA> A
NOTE:
The script above assumes that your NAs are actual NA values (not the character string "NA")
and that your variables are of type character.
DATA
dput(df1)
structure(list(id_1 = c("a1", "a1", "a2"), id_2 = c("b1", "b2",
"b1"), col_1 = c("A", "A", NA), col2 = c("A", NA, NA), col_25 = c(NA,
"A", "A")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
> dput(df2)
structure(list(id_1 = c("a1", "a1", "a1"), id_2 = c("b1", "b2",
"b3"), col_1 = c("B", NA, "B"), col2 = c(NA, "B", NA), col_25 = c(NA,
"B", "B")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
How can I add the (n-1)-th column to the n-th column of a data frame, for a subset of columns?
For example i have a dataframe as follows:
ID C1 C2 C3
1 2000-12-24 3d 2d
2 2000-12-24 2d 1d
i want R to do the following:
ID C1 C2 C3
1 2000-12-24 2000-12-24+3d=2000-12-27 2000-12-27+2d=2000-12-29
2 2000-12-24 2000-12-24+2d=2000-12-26 2000-12-26+1d=2000-12-27
so that the final dataframe looks like this:
ID C1 C2 C3 ...
1 2000-12-24 2000-12-27 2000-12-29
2 2000-12-24 2000-12-26 2000-12-27
UPDATE:
The data has been generated accordingly:
library(plyr)
library(lubridate)
library(reshape2)
# Two random gamma draws; no seed is set, so every run gives different data
Heterotransaction <- rgamma(2,shape=3 , scale=1)
ID <- list(1:2)
Elog <- data.frame(ID,Heterotransaction)
# NOTE(review): "fist_transaction" is presumably a typo for
# "first_transaction"; left as-is because it is the column name.
Elog$fist_transaction <- "2000-12-24"
Elog$fist_transaction <- as.Date(Elog$fist_transaction, "%Y-%m-%d")
# NOTE(review): Heterotransaction is drawn a second time here, so tbtrans is
# built from different values than the ones stored in Elog - confirm intended.
Heterotransaction <- rgamma(2,shape=3 , scale=1)
# Draw two exponential values with rate x and wrap them as lubridate
# durations given in years.
f.transaction <- function(x){
y<- (rexp(2,x))
duration(y, units = "years")
}
# Apply f.transaction to every draw and bind the results into a data frame
tbtrans<-ldply(Heterotransaction, f.transaction)
purchases<-data.frame(ID,tbtrans)
# Merge on the columns the two data frames have in common
Elognew<- merge.data.frame(Elog, purchases)
You could try
# Replace every column from the third onwards with the row-wise sum of
# columns 2..k; the right-hand side is evaluated in full before assignment,
# so each sum is taken over the original values.
df1[seq(3, ncol(df1))] <- lapply(
  seq(3, ncol(df1)),
  function(k) rowSums(df1[seq(2, k)])
)
df1
# ID C1 C2 C3
#1 1 2 5 7
#2 2 4 7 8
or
# Row-wise cumulative sum over every column except the first; apply() returns
# one column per input row, hence the transpose before assigning back.
df1[-1] <- t(apply(df1[-1], 1, cumsum))
Or another option would be to use Reduce
library(data.table)
# Reduce() with accumulate = TRUE adds the selected columns one after another
# and keeps every intermediate sum; := writes them back over columns
# 2..ncol by reference, and the trailing [] prints the result.
setDT(df1)[,2:ncol(df1) := Reduce(`+`, .SD, accumulate=TRUE),
.SDcols=2:ncol(df1)][]
# ID C1 C2 C3
#1: 1 2 5 7
#2: 2 4 7 8
Update
Based on the new dataset, one option would be to modify the first solution
# Turn the "<n>d" offsets in columns 3..ncol into dates: each column becomes
# the start date in column 2 plus the running total of the offsets up to and
# including that column, computed separately for every row.
# The totals are accumulated across columns with Reduce(); a cumsum() within
# each column would add up offsets that belong to different IDs.
df2[3:ncol(df2)] <- local({
  start_date <- df2[[2]]
  # strip the unit suffix ("3d" -> 3) from every offset column
  day_counts <- lapply(
    df2[3:ncol(df2)],
    function(col) as.numeric(sub("[^0-9]+", "", col))
  )
  running_days <- Reduce(`+`, day_counts, accumulate = TRUE)
  # Date + numeric stays a Date, so no conversion back is needed
  lapply(running_days, function(days) start_date + days)
})
df2
# ID C1 C2 C3
#1 1 2000-12-24 2000-12-27 2000-12-29
#2 2 2000-12-24 2000-12-26 2000-12-27
data
# df1: numeric example (C1 is the start value, C2/C3 are increments)
df1 <- data.frame(
  ID = 1:2,
  C1 = c(2L, 4L),
  C2 = c(3L, 3L),
  C3 = c(2L, 1L)
)
# df2: date example (C1 is the start date, C2/C3 are "<n>d" day offsets)
df2 <- data.frame(
  ID = 1:2,
  C1 = as.Date(c("2000-12-24", "2000-12-24")),
  C2 = c("3d", "2d"),
  C3 = c("2d", "1d"),
  stringsAsFactors = FALSE
)