I would appreciate any help creating new variables from one variable.
Specifically, I need to simultaneously create one row per ID and several columns of E, where each new column (that is, E1, E2, E3) contains one of the values of E for that ID. I tried doing this with melt followed by spread, but I am getting the error:
Error: Duplicate identifiers for rows (4, 7, 9), (1, 3, 6), (2, 5, 8)
Additionally, I tried the solutions discussed here and here, but these did not work for my case because I need to be able to create row identifiers for rows (4, 1, 2), (7, 3, 5), and (9, 6, 8). That is, E for rows (4, 1, 2) should be named E1, E for rows (7, 3, 5) should be named E2, E for rows (9, 6, 8) should be named E3, and so on.
#data
dT<-structure(list(A = c("a1", "a2", "a1", "a1", "a2", "a1", "a1",
"a2", "a1"), B = c("b2", "b2", "b2", "b1", "b2", "b2", "b1",
"b2", "b1"), ID = c("3", "4", "3", "1", "4", "3", "1", "4", "1"
), E = c(0.621142094943352, 0.742109450696123, 0.39439152996948,
0.40694392882818, 0.779607277916503, 0.550579323666347, 0.352622183880119,
0.690660491345867, 0.23378944873769)), class = c("data.table",
"data.frame"), row.names = c(NA, -9L))
#my attempt
A B ID E
1: a1 b2 3 0.6211421
2: a2 b2 4 0.7421095
3: a1 b2 3 0.3943915
4: a1 b1 1 0.4069439
5: a2 b2 4 0.7796073
6: a1 b2 3 0.5505793
7: a1 b1 1 0.3526222
8: a2 b2 4 0.6906605
9: a1 b1 1 0.2337894
aTempDF <- melt(dT, id.vars = c("A", "B", "ID"))
A B ID variable value
1: a1 b2 3 E 0.6211421
2: a2 b2 4 E 0.7421095
3: a1 b2 3 E 0.3943915
4: a1 b1 1 E 0.4069439
5: a2 b2 4 E 0.7796073
6: a1 b2 3 E 0.5505793
7: a1 b1 1 E 0.3526222
8: a2 b2 4 E 0.6906605
9: a1 b1 1 E 0.2337894
aTempDF %>% spread(variable, value)
Error: Duplicate identifiers for rows (4, 7, 9), (1, 3, 6), (2, 5, 8)
#expected output
A B ID E1 E2 E3
1: a1 b2 3 0.6211421 0.3943915 0.5505793
2: a2 b2 4 0.7421095 0.7796073 0.6906605
3: a1 b1 1 0.4069439 0.3526222 0.2337894
Thanks in advance for any help.
You can use dcast from data.table
library(data.table)
dcast(dT, A + B + ID ~ paste0("E", rowid(ID)))
# A B ID E1 E2 E3
#1 a1 b1 1 0.4069439 0.3526222 0.2337894
#2 a1 b2 3 0.6211421 0.3943915 0.5505793
#3 a2 b2 4 0.7421095 0.7796073 0.6906605
You need to create the correct 'time variable' first, which is what rowid(ID) does.
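To see the counter rowid() builds, here it is applied to dT's ID column on its own (an illustrative sketch):
library(data.table)
# rowid() numbers each occurrence of a value in order of appearance
rowid(c("3", "4", "3", "1", "4", "3", "1", "4", "1"))
#[1] 1 1 2 1 2 3 2 3 3
This is exactly the 'time variable' dcast needs to tell the first, second, and third E value apart within each ID.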
For those looking for a tidyverse solution:
library(tidyverse)
dT <- structure(
list(
A = c("a1", "a2", "a1", "a1", "a2", "a1", "a1", "a2", "a1"),
B = c("b2", "b2", "b2", "b1", "b2", "b2", "b1", "b2", "b1"),
ID = c("3", "4", "3", "1", "4", "3", "1", "4", "1"),
E = c(0.621142094943352, 0.742109450696123, 0.39439152996948, 0.40694392882818,
0.779607277916503, 0.550579323666347, 0.352622183880119, 0.690660491345867,
0.23378944873769)),
class = c("data.table",
"data.frame"),
row.names = c(NA, -9L))
dT %>%
as_tibble() %>% # since dataset is a data.table object
group_by(A, B, ID) %>%
# Just so columns are "E1", "E2", etc.
mutate(rn = glue::glue("E{row_number()}")) %>%
ungroup() %>%
spread(rn, E) %>%
# not necessary, just making output in the same order as your expected output
arrange(desc(B))
# A tibble: 3 x 6
# A B ID E1 E2 E3
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 a1 b2 3 0.621 0.394 0.551
#2 a2 b2 4 0.742 0.780 0.691
#3 a1 b1 1 0.407 0.353 0.234
As mentioned in the accepted answer, you need a "key" variable to spread on first. This is created using row_number() and glue, where glue just builds the proper E1, E2, etc. variable names.
The group_by piece just makes sure that the row numbers are with respect to A, B and ID.
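For reference, this is the intermediate tibble just before the spread() step (a sketch on the same dT):
dT %>%
  as_tibble() %>%
  group_by(A, B, ID) %>%
  mutate(rn = glue::glue("E{row_number()}")) %>%
  ungroup()
# rn cycles through E1, E2, E3 within each (A, B, ID) combination,
# giving spread() a unique identifier for every row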
EDIT for tidyr >= 1.0.0
The (not-so) new pivot_ functions supersede gather and spread and eliminate the need to glue the new variable names together in a mutate.
dT %>%
as_tibble() %>% # since dataset is a data.table object
group_by(A, B, ID) %>%
# no longer need to glue (or paste) the names together but still need a row number
mutate(rn = row_number()) %>%
ungroup() %>%
pivot_wider(names_from = rn, values_from = E, names_glue = "E{rn}") %>% # the names_glue argument builds the new column names from the names_from column
# not necessary, just making output in the same order as your expected output
arrange(desc(B))
# A tibble: 3 x 6
# A B ID E1 E2 E3
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 a1 b2 3 0.621 0.394 0.551
#2 a2 b2 4 0.742 0.780 0.691
#3 a1 b1 1 0.407 0.353 0.234
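Since the new names only need a constant prefix here, the names_prefix argument is an equivalent, slightly simpler option (a sketch on the same data):
dT %>%
  as_tibble() %>%
  group_by(A, B, ID) %>%
  mutate(rn = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = rn, values_from = E, names_prefix = "E") %>%
  arrange(desc(B))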
I have this dataframe:
df <- structure(list(col1 = c("Z2", "A2", "B2", "C2", "A2", "E2", "F2",
"G2"), col2 = c("Z2", "Z2", "A2", "B2", "C2", "D2", "A2", "F2"
), col3 = c("A2", "B2", "C2", "D2", "E2", "F2", "G2", "Z2")), class = "data.frame", row.names = c(NA, -8L))
> df
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 C2 B2 D2
5 A2 C2 E2
6 E2 D2 F2
7 F2 A2 G2
8 G2 F2 Z2
I would like to explicitly use filter, across and str_detect in a tidyverse setting to keep all rows that have a value starting with "A" in any of col1:col3.
Expected result:
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 A2 C2 E2
5 F2 A2 G2
I have tried:
library(dplyr)
library(stringr)
df %>%
filter(across(c(col1, col2, col3), ~str_detect(., "^A")))
This gives:
[1] col1 col2 col3
<0 rows> (or 0-length row.names)
I want to learn why this code is not working using filter, across and str_detect!
We can use if_any here. across combines the per-column results with & (an AND condition), i.e. every selected column must meet the condition for a row to be kept:
library(dplyr)
library(stringr)
df %>%
filter(if_any(everything(), ~str_detect(., "^A")))
Output:
col1 col2 col3
1 Z2 Z2 A2
2 A2 Z2 B2
3 B2 A2 C2
4 A2 C2 E2
5 F2 A2 G2
According to ?across
if_any() and if_all() apply the same predicate function to a selection of columns and combine the results into a single logical vector: if_any() is TRUE when the predicate is TRUE for any of the selected columns, if_all() is TRUE when the predicate is TRUE for all selected columns.
across() supersedes the family of "scoped variants" like summarise_at(), summarise_if(), and summarise_all().
if_any()/if_all() are not part of those scoped variants.
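A quick sketch on the same df to make the contrast concrete (if_any()/if_all() need dplyr >= 1.0.4):
# if_all() behaves like across() did here: every column must start with "A",
# which no row satisfies, so the result is empty
df %>%
  filter(if_all(everything(), ~str_detect(., "^A")))
# [1] col1 col2 col3
# <0 rows> (or 0-length row.names)
# if_any() keeps a row as soon as any column matches
df %>%
  filter(if_any(everything(), ~str_detect(., "^A")))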
I have a dataframe with multiple products and different date ranges. I want to assign a unique value to each date so that, even if the starting dates differ across products, I can group by the dates.
df
acc product date
a1 p1 d1
a1 p1 d2
a1 p1 d3
a1 p1 d4
a1 p2 d1
a1 p2 d2
a1 p2 d3
a1 p3 d3
a1 p3 d4
I want to arrange the dates so that there is a unique identifier for each of d1, d2, d3, etc.
I used the following code to try this:
df <- df %>% group_by(acc, product) %>% mutate(t = row_number())
Output
df
acc product date t EXPECTED
a1 p1 d1 1 1
a1 p1 d2 2 2
a1 p1 d3 3 3
a1 p1 d4 4 4
a1 p2 d1 1 1
a1 p2 d2 2 2
a1 p2 d3 3 3
a1 p3 d3 1 3
a1 p3 d4 2 4
Any suggestions for this?
Use dplyr::dense_rank():
df %>% mutate(new = dense_rank(date))
acc product date new
1 a1 p1 d1 1
2 a1 p1 d2 2
3 a1 p1 d3 3
4 a1 p1 d4 4
5 a1 p2 d1 1
6 a1 p2 d2 2
7 a1 p2 d3 3
8 a1 p3 d3 3
9 a1 p3 d4 4
If, however, you want the ranks to restart for each acc, use group_by() before the mutate() statement, as in the sketch below.
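For example (a sketch; with this particular data there is only one acc, so the output happens to be identical):
df %>%
  group_by(acc) %>%
  mutate(new = dense_rank(date)) %>%
  ungroup()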
dput used
df <- structure(list(acc = c("a1", "a1", "a1", "a1", "a1", "a1", "a1",
"a1", "a1"), product = c("p1", "p1", "p1", "p1", "p2", "p2",
"p2", "p3", "p3"), date = c("d1", "d2", "d3", "d4", "d1", "d2",
"d3", "d3", "d4")), class = "data.frame", row.names = c(NA, -9L
))
I have two data sets, data1 and data2:
data1 <- data.frame(ID = 1:6,
A = c("a1", "a2", NA, "a4", "a5", NA),
B = c("b1", "b2", "b3", NA, "b5", NA),
stringsAsFactors = FALSE)
data1
ID A B
1 a1 b1
2 a2 b2
3 NA b3
4 a4 NA
5 a5 b5
6 NA NA
and
data2 <- data.frame(ID = 1:6,
A = c(NA, "a2", "a3", NA, "a5", "a6"),
B = c(NA, "b2.wrong", NA, "b4", "b5", "b6"),
stringsAsFactors = FALSE)
data2
ID A B
1 NA NA
2 a2 b2.wrong
3 a3 NA
4 NA b4
5 a5 b5
6 a6 b6
I would like to merge them by ID so that the resulting merged dataset, data.merged, populates fields from both datasets but takes the value from data1 whenever values are available from both.
I.e., I would like the final dataset, data.merged, to be:
ID A B
1 a1 b1
2 a2 b2
3 a3 b3
4 a4 b4
5 a5 b5
6 a6 b6
I have looked around, finding similar but not exact answers.
You can join the data and use coalesce to select the first non-NA value.
library(dplyr)
data1 %>%
inner_join(data2, by = 'ID') %>%
mutate(A = coalesce(A.x, A.y),
B = coalesce(B.x, B.y)) %>%
select(names(data1))
# ID A B
#1 1 a1 b1
#2 2 a2 b2
#3 3 a3 b3
#4 4 a4 b4
#5 5 a5 b5
#6 6 a6 b6
Or in base R, comparing values with NA:
transform(merge(data1, data2, by = 'ID'),
A = ifelse(is.na(A.x), A.y, A.x),
B = ifelse(is.na(B.x), B.y, B.x))[names(data1)]
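If there were many more columns than A and B, here is a sketch that coalesces every shared non-ID column without naming each one (it assumes both frames have identical column names):
library(dplyr)
merged <- inner_join(data1, data2, by = 'ID', suffix = c('.x', '.y'))
# coalesce each pair of suffixed columns back into a single column
for (col in setdiff(names(data1), 'ID')) {
  merged[[col]] <- coalesce(merged[[paste0(col, '.x')]], merged[[paste0(col, '.y')]])
}
merged[names(data1)]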
I have two dataframes with the same structure - both have two ID columns and 25 string data columns. I want to join the two and concatenate the strings in the data columns when the IDs match. So, for example:
df_1:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A
df_2:
id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B
Combined, this should give
df_combined:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A, B A ... <NA>
a1 b2 A B ... A, B
a1 b3 B <NA> ... B
a2 b1 <NA> <NA> ... A
When I try to use join or merge, it repeats everything except the ID columns (so I end up with 50 data columns). Do I need to use something else?
Thanks!
You can do this if you don't have any empty strings:
library(dplyr)
bind_rows(df_1, df_2) %>%
  group_by(id_1, id_2) %>%
  summarize_all(~ paste(na.omit(.x), collapse = ", ")) %>%
  `[<-`(. == "", value = NA)
With magrittr you can avoid the not-so-pretty `[<-` and replace it with inset:
library(magrittr)
bind_rows(df_1, df_2) %>%
  group_by(id_1, id_2) %>%
  summarize_all(~ paste(na.omit(.x), collapse = ", ")) %>%
  inset(. == "", value = NA)
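If you prefer to stay entirely within dplyr, na_if() can turn the empty strings back into NA; a sketch of the same pipeline:
library(dplyr)
bind_rows(df_1, df_2) %>%
  group_by(id_1, id_2) %>%
  summarize_all(~ paste(na.omit(.x), collapse = ", ")) %>%
  ungroup() %>%
  mutate_all(~ na_if(., ""))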
There is an alternative solution using melt() and dcast() to reshape the data:
library(data.table)
rbind(setDT(df_1), setDT(df_2))[
, melt(.SD, measure.var = patterns("col"), na.rm = TRUE)][
, dcast(.SD, id_1 + id_2 ~ variable, toString, fill = NA)]
id_1 id_2 col_1 col2 col_25
1: a1 b1 A, B A NA
2: a1 b2 A B A, B
3: a1 b3 B NA B
4: a2 b1 NA NA A
Data
df_1 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A",
drop = 5L, na.strings = "<NA>"
)
df_2 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B",
drop = 5L, na.strings = "<NA>"
)
To elaborate on the idea commented by @zx8754, using the dplyr package,
library(dplyr)
df1 %>%
  bind_rows(df2) %>%
  mutate_at(vars(-contains('id')), ~ replace(., is.na(.), '')) %>%
  group_by(id_1, id_2) %>%
  summarise_all(~ trimws(paste(., collapse = ' '))) %>%
  mutate_all(~ replace(., . == '', NA))
which gives,
# A tibble: 4 x 5
# Groups: id_1 [2]
id_1 id_2 col_1 col2 col_25
<chr> <chr> <chr> <chr> <chr>
1 a1 b1 A B A <NA>
2 a1 b2 A B A B
3 a1 b3 B <NA> B
4 a2 b1 <NA> <NA> A
NOTE:
The script above assumes that your NAs are actual NA values (not the character string "NA")
Your variables are character (not factor)
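If they are factors instead, a quick conversion beforehand (a sketch):
df1[] <- lapply(df1, as.character)
df2[] <- lapply(df2, as.character)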
DATA
dput(df1)
structure(list(id_1 = c("a1", "a1", "a2"), id_2 = c("b1", "b2",
"b1"), col_1 = c("A", "A", NA), col2 = c("A", NA, NA), col_25 = c(NA,
"A", "A")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
dput(df2)
structure(list(id_1 = c("a1", "a1", "a1"), id_2 = c("b1", "b2",
"b3"), col_1 = c("B", NA, "B"), col2 = c(NA, "B", NA), col_25 = c(NA,
"B", "B")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
I have two data.frames: one is a look-up table that tells me the set of products included in each group. Each group has at least one product of Type 1 and one of Type 2.
The second data.frame tells me details about the transaction. Each transaction can have one of the following products:
a) Only products of Type 1 from one of the groups
b) Only products of Type 2 from one of the groups
c) Product of Type 1 and Type 2 from the same group
For my analysis, I am interested in finding out c) above, i.e. how many transactions sold a product of Type 1 and a product of Type 2 from the same group. We will ignore a transaction altogether if it contains a product of Type 1 and a product of Type 2 from different groups.
Thus, the Type 1 and Type 2 products in a transaction MUST belong to the same group.
Here's my look up table:
> P_Lookup
Group ProductID1 ProductID2
Group1 A 1
Group1 B 2
Group1 B 3
Group2 C 4
Group2 C 5
Group2 C 6
Group3 D 7
Group3 C 8
Group3 C 9
Group4 E 10
Group4 F 11
Group4 G 12
Group5 H 13
Group5 H 14
Group5 H 15
For instance, I won't have Product G and Product 15 in one transaction because they belong to different groups.
Here are the transactions:
TransactionID ProductID ProductType
a1 A 1
a1 B 1
a1 1 2
a2 C 1
a2 4 2
a2 5 2
a3 D 1
a3 C 1
a3 7 2
a3 8 2
a4 H 1
a5 1 2
a5 2 2
a5 3 2
a5 3 2
a5 1 2
a6 H 1
a6 15 2
My Code:
Now, I was able to write code using dplyr for shortlisting transactions from one group. However, I am not sure how I can vectorize my code for all groups.
Here's my code:
P_Groups<-unique(P_Lookup$Group)
Chosen_Group<-P_Groups[5]
P_Group_Ind <- P_Trans %>%
group_by(TransactionID)%>%
dplyr::filter((ProductID %in% unique(P_Lookup[P_Lookup$Group==Chosen_Group,]$ProductID1)) |
(ProductID %in% unique(P_Lookup[P_Lookup$Group==Chosen_Group,]$ProductID2)) ) %>%
mutate(No_of_PIDs = n_distinct(ProductType)) %>%
mutate(Group_Name = Chosen_Group)
P_Group_Ind<-P_Group_Ind[P_Group_Ind$No_of_PIDs>1,]
This works well as long as I manually select each group, i.e. by setting Chosen_Group. However, I am not sure how to automate this. One option I am considering is a for loop, but the beauty of R is vectorization, so I want to stay away from for loops.
I'd sincerely appreciate any help; I have spent almost two days on this. I looked at "using dplyr in for loop in r", but that thread seems to be about a different issue.
DATA:
Here's dput for P_Trans:
structure(list(TransactionID = c("a1", "a1", "a1", "a2", "a2",
"a2", "a3", "a3", "a3", "a3", "a4", "a5", "a5", "a5", "a5", "a5",
"a6", "a6"), ProductID = c("A", "B", "1", "C", "4", "5", "D",
"C", "7", "8", "H", "1", "2", "3", "3", "1", "H", "15"), ProductType = c(1,
1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2)), .Names = c("TransactionID",
"ProductID", "ProductType"), row.names = c(NA, 18L), class = "data.frame")
Here's dput for P_Lookup:
structure(list(Group = c("Group1", "Group1", "Group1", "Group2",
"Group2", "Group2", "Group3", "Group3", "Group3", "Group4", "Group4",
"Group4", "Group5", "Group5", "Group5"), ProductID1 = c("A",
"B", "B", "C", "C", "C", "D", "C", "C", "E", "F", "G", "H", "H",
"H"), ProductID2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15)), .Names = c("Group", "ProductID1", "ProductID2"), row.names = c(NA,
15L), class = "data.frame")
Here's the dput() after adding a product to P_Trans that doesn't exist in the look-up table:
structure(list(TransactionID = c("a1", "a1", "a1", "a2", "a2",
"a2", "a3", "a3", "a3", "a3", "a4", "a5", "a5", "a5", "a5", "a5",
"a6", "a6", "a7"), ProductID = c("A", "B", "1", "C", "4", "5",
"D", "C", "7", "8", "H", "1", "2", "3", "3", "1", "H", "15",
"22"), ProductType = c(1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2,
2, 2, 2, 1, 2, 3)), .Names = c("TransactionID", "ProductID",
"ProductType"), row.names = c(NA, 19L), class = "data.frame")
Below is a tidyverse (dplyr, tidyr, and purrr) solution that I hope will help.
Note that the use of map_df in the last line returns all results as a data frame. If you'd prefer it to be a list object for each group, then simply use map.
library(dplyr)
library(tidyr)
library(purrr)
# Save unique groups for later use
P_Groups <- unique(P_Lookup$Group)
# Convert lookup table to product IDs and Groups
P_Lookup <- P_Lookup %>%
gather(ProductIDn, ProductID, ProductID1, ProductID2) %>%
select(ProductID, Group) %>%
distinct() %>%
nest(-ProductID, .key = Group)
# Bind Group information to transactions
# and group for next analysis
P_Trans <- P_Trans %>%
left_join(P_Lookup) %>%
filter(!map_lgl(Group, is.null)) %>%
unnest(Group) %>%
group_by(TransactionID)
# Iterate through Groups to produce results
map(P_Groups, ~ filter(P_Trans, Group == .)) %>%
map(~ mutate(., No_of_PIDs = n_distinct(ProductType))) %>%
map_df(~ filter(., No_of_PIDs > 1))
#> Source: local data frame [12 x 5]
#> Groups: TransactionID [4]
#>
#> TransactionID ProductID ProductType Group No_of_PIDs
#> <chr> <chr> <dbl> <chr> <int>
#> 1 a1 A 1 Group1 2
#> 2 a1 B 1 Group1 2
#> 3 a1 1 2 Group1 2
#> 4 a2 C 1 Group2 2
#> 5 a2 4 2 Group2 2
#> 6 a2 5 2 Group2 2
#> 7 a3 D 1 Group3 2
#> 8 a3 C 1 Group3 2
#> 9 a3 7 2 Group3 2
#> 10 a3 8 2 Group3 2
#> 11 a6 H 1 Group5 2
#> 12 a6 15 2 Group5 2
Here is a single pipe dplyr solution:
P_DualGroupTransactionsCount <-
P_Lookup %>% # data needing single column map of Keys
gather(IDnum, ProductID, ProductID1:ProductID2) %>% # reshape into one long ProductID-to-Group map (tidyr::gather)
right_join(P_Trans) %>% # join transactions to group info
group_by(TransactionID, Group) %>% # organize for same transaction & same group
mutate(DualGroup = n_distinct(ProductType) == 2) %>% # flag groups with both product types in a single transaction
filter(DualGroup == T) %>% # choose only doubles
select(TransactionID, Group) %>% # remove excess columns
distinct %>% # remove excess rows
nrow # count of unique transaction IDs
# P_DualGroupTransactions (the intermediate result, before the final nrow step):
# Source: local data frame [4 x 2]
# Groups: TransactionID, Group [4]
#
# TransactionID Group
# <chr> <chr>
# 1 a1 Group1
# 2 a2 Group2
# 3 a3 Group3
# 4 a6 Group5
# P_DualGroupTransactionsCount
[1] 4