right join with dplyr make rows columns - r

I would like to do right join data1 and data2 by ProductCode and I need to get below desired output table
data1=data.frame(ProductCode=c(1,1,1,2,2,3),region=c("A","A","A","B","B","C"))
data1
ProductCode region
1 A
1 A
1 A
2 B
2 B
3 C
data2=data.frame(ProductCode=c(1,1,1,2,2,3),Period=c("promo1","promo2"
,"promo3","promo2","promo3","promo1"),promosales=c(15,12,7,18,20,2))
data2
ProductCode Period promosales
1 promo1 15
1 promo2 12
1 promo3 7
2 promo2 18
2 promo3 20
3 promo1 2
Desired output table
ProdcutCode region Promo1_sales Promo2_sales Promo3_sales
1 A 15 12 7
2 B 18 20 0
3 C 2 0 0
If I do it with sql, I have to group by after that by maximizing each row
sqldf("select a.*,
case when Period='promo1' then b.promosales else 0 end as
Promo1_sales1,
case when Period='promo2' then b.promosales else 0 end as
Promo1_sales2,
case when Period='promo3' then b.promosales else 0 end as
Promo1_sales3,
case when Period='promo4' then b.promosales else 0 end as
Promo1_sales4
from data1 a
left join data2 b on a.ProductCode=b.ProductCode
")
Can I do it dplyr or anything else?
Thank you.

Not sure this will work in your general case, but you can do:
data1 <- data.frame(ProductCode=c(1,1,1,2,2,3),
region=c(rep('A', 3), rep('B', 2),'C'))
data2 <- data.frame(ProductCode=c(1,1,1,2,2,3),
Period=c("promo1","promo2","promo3","promo2","promo3","promo1"),
promosales=c(15,12,7,18,20,2))
library(dplyr)
library(tidyr)
data1 %>%
distinct() %>%
inner_join(data2, by = 'ProductCode') %>%
group_by(ProductCode) %>%
mutate(rownr = paste0('Promo', row_number(), '_sales')) %>%
select(-Period) %>%
spread(rownr, promosales, fill = 0)
#> # A tibble: 3 x 5
#> # Groups: ProductCode [3]
#> ProductCode region Promo1_sales Promo2_sales Promo3_sales
#> <dbl> <fct> <dbl> <dbl> <dbl>
#> 1 1 A 15 12 7
#> 2 2 B 18 20 0
#> 3 3 C 2 0 0
A better approach would be simpler:
data1 %>%
distinct() %>%
inner_join(data2, by = 'ProductCode') %>%
group_by(ProductCode) %>%
spread(Period, promosales, fill = 0)
#> # A tibble: 3 x 5
#> # Groups: ProductCode [3]
#> ProductCode region promo1 promo2 promo3
#> <dbl> <fct> <dbl> <dbl> <dbl>
#> 1 1 A 15 12 7
#> 2 2 B 0 18 20
#> 3 3 C 2 0 0
Created on 2018-05-23 by the reprex package (v0.2.0).

Related

The Most Efficient Way of Forming Groups using R

I have a tibble dt given as follows:
library(tidyverse)
dt <- tibble(x=as.integer(c(0,0,1,0,0,0,1,1,0,1))) %>%
mutate(grp = as.factor(c(rep("A",3), rep("B",4), rep("C",1), rep("D",2))))
dt
As one can observe the rule for grouping is:
starts 0 and ends with 1 (e.g., groups A, B, D) or
it solely contains 1 (e.g., group C)
Problem: Given a tibble with column integer vector x of zeros and 1 that starts with 0 and ends in 1, what is the most efficient way to obtain a grouping using R? (You can use any grouping symbols/factors.)
We can get the cumulative sum of 'x' (assuming it is binary), take the lag add 1 and use that index to replace it with LETTERS (Note that LETTERS was used only as part of matching with the expected output - it can take go up to certain limit)
library(dplyr)
dt %>%
mutate(grp2 = LETTERS[lag(cumsum(x), default = 0)+ 1])
-output
# A tibble: 10 x 3
x grp grp2
<int> <fct> <chr>
1 0 A A
2 0 A A
3 1 A A
4 0 B B
5 0 B B
6 0 B B
7 1 B B
8 1 C C
9 0 D D
10 1 D D
Though the strategy proposed by Akrun is fantastic, yet to show that it can be managed through accumulate also
library(tidyverse)
dt <- tibble(x=as.integer(c(0,0,1,0,0,0,1,1,0,1))) %>%
mutate(grp = as.factor(c(rep("A",3), rep("B",4), rep("C",1), rep("D",2))))
dt %>%
mutate(GRP = accumulate(lag(x, default = 0),.init =1, ~ if(.y != 1) .x else .x+1)[-1])
#> # A tibble: 10 x 3
#> x grp GRP
#> <int> <fct> <dbl>
#> 1 0 A 1
#> 2 0 A 1
#> 3 1 A 1
#> 4 0 B 2
#> 5 0 B 2
#> 6 0 B 2
#> 7 1 B 2
#> 8 1 C 3
#> 9 0 D 4
#> 10 1 D 4
Created on 2021-06-13 by the reprex package (v2.0.0)

filter all rows smaller than x with all following values also smaller than x

I am looking for a concise way to filter a data.frame for all rows smaller than a value x with all following values also smaller than x. I found a way but it is somehwat verbose. I tried to do it with dplyr::cumall and cumany, but was not able to figure it out.
Here is a small reprex including my actual approach. Ideally I would only have one filter line or mutate + filter, but with the current approach it takes two rounds of mutate/filter.
library(dplyr)
# Original data
tbl <- tibble(value = c(100,100,100,10,10,5,10,10,5,5,5,1,1,1,1))
# desired output:
# keep only rows, where value is smaller than 5 and ...
# no value after that is larger than 5
tbl %>%
mutate(id = row_number()) %>%
filter(value <= 5) %>%
mutate(id2 = lead(id, default = max(id) + 1) - id) %>%
filter(id2 == 1)
#> # A tibble: 7 x 3
#> value id id2
#> <dbl> <int> <dbl>
#> 1 5 9 1
#> 2 5 10 1
#> 3 5 11 1
#> 4 1 12 1
#> 5 1 13 1
#> 6 1 14 1
#> 7 1 15 1
Created on 2020-04-20 by the reprex package (v0.3.0)
You could combine cummin with a reversed reverse cummax:
tbl %>% filter(rev(cummax(rev(value))) <= 5 & cummin(value) <= 5)
# A tibble: 7 x 1
value
<dbl>
1 5
2 5
3 5
4 1
5 1
6 1
7 1
A base R option is to use subset + rle
tblout <- subset(tbl,
with(rle(value<=5 & c(0,diff(value))<=0),
rep(lengths>1 & values,lengths)))
such that
> tblout
# A tibble: 7 x 1
value
<dbl>
1 5
2 5
3 5
4 1
5 1
6 1
7 1

Add original values for columns after group by

For the dataframe below I want to add the original values for Var_x after a group_by on ID and event and a max() on quest, but I cannot get my code right. Any suggestions? By the way, in my original dataframe more than 1 column needs to be added.
df <- data.frame(ID = c(1,1,1,1,1,1,2,2,2,3,3,3),
quest = c(1,1,2,2,3,3,1,2,3,1,2,3),
event = c("A","B","A","B","A",NA,"C","D","C","D","D",NA),
VAR_X = c(2,4,3,6,3,NA,6,4,5,7,5,NA))
Code:
df %>%
group_by(ID,event) %>%
summarise(quest = max(quest))
Desired output:
ID quest event VAR_X
1 1 2 B 6
2 1 3 A 3
3 2 2 D 4
4 2 3 C 5
5 3 2 D 5
Start by omiting the na values and in the end do an inner_join with the original data set.
df %>%
na.omit() %>%
group_by(ID, event) %>%
summarise(quest = max(quest)) %>%
inner_join(df, by = c("ID", "event", "quest"))
## A tibble: 5 x 4
## Groups: ID [3]
# ID event quest VAR_X
# <dbl> <fct> <dbl> <dbl>
#1 1 A 3 3
#2 1 B 2 6
#3 2 C 3 5
#4 2 D 2 4
#5 3 D 2 5
df %>%
drop_na() %>% # remove if necessary ..
group_by(ID, event) %>%
filter(quest == max(quest)) %>%
ungroup()
# A tibble: 5 x 4
# ID quest event VAR_X
#<dbl> <dbl> <chr> <dbl>
# 1 1 2 B 6
# 2 1 3 A 3
# 3 2 2 D 4
# 4 2 3 C 5
# 5 3 2 D 5

dplyr: divide all values in group by group's first value

My df looks something like this:
ID Obs Value
1 1 26
1 2 13
1 3 52
2 1 1,5
2 2 30
Using dplyr, I to add the additional column Col, which is the result of a division of all values in the column value by the group's first value in that column.
ID Obs Value Col
1 1 26 1
1 2 13 0,5
1 3 52 2
2 1 1,5 1
2 2 30 20
How do I do that?
After grouping by 'ID', use mutate to create a new column by dividing the 'Value' by the first of 'Value'
library(dplyr)
df1 %>%
group_by(ID) %>%
mutate(Col = Value/first(Value))
If the first 'Value' is 0 and we don't want to use it, then subset the 'Value' with a logical expression and then take the first of that
df1 %>%
group_by(ID) %>%
mutate(Col = Value/first(Value[Value != 0]))
Or in base R
df1$Col <- with(df1, Value/ave(Value, ID, FUN = head, 1))
NOTE: The comma in 'Value' suggests it is a character column. In that case, it should be first changed to decimal (.) if that is the case, convert to nunmeric and then do the division. It can be done while reading the data
Or, without creating an additional column:
library(tidyverse)
df = data.frame(ID=c(1,1,1,2,2), Obs=c(1,2,3,1,2), Value=c(26, 13, 52, 1.5, 30))
df %>%
group_by(ID) %>%
mutate_at('Value', ~./first(.))
#> # A tibble: 5 x 3
#> # Groups: ID [2]
#> ID Obs Value
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 2 0.5
#> 3 1 3 2
#> 4 2 1 1
#> 5 2 2 20
### OR ###
df %>%
group_by(ID) %>%
mutate_at('Value', function(x) x/first(x))
#> # A tibble: 5 x 3
#> # Groups: ID [2]
#> ID Obs Value
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 2 0.5
#> 3 1 3 2
#> 4 2 1 1
#> 5 2 2 20
Created on 2020-01-04 by the reprex package (v0.3.0)

R dplyr - select values from one column based on position of a specific value in another column

I am working with gait-cycle data. I have 8 events marked for each id and gait trial. The values "LFCH" and "RFCH" occurs twice in each trial, as these represent the beginning and the end of the gait cycles from left and right leg.
Sample Data Frame:
df <- data.frame(ID = rep(1:5, each = 16),
Gait_nr = rep(1:2, each = 8, times=5),
Frame = rep(c(1,5,7,9,10,15,22,25), times = 10),
Marks = rep(c("LFCH", "LHL", "RFCH", "LTO", "RHL", "LFCH", "RTO", "RFCH"), times =10)
head(df,8)
ID Gait_nr Frame Marks
1 1 1 1 LFCH
2 1 1 5 LHL
3 1 1 7 RFCH
4 1 1 9 LTO
5 1 1 10 RHL
6 1 1 15 LFCH
7 1 1 22 RTO
8 1 1 25 RFCH
I wold like to create something like
Total_gait_left = Frame[The last time Marks == "LFCH"] - Frame[The first time Marks == "LFCH"]
My current code solves the problem, but depends on the position of the Frame values rather than actual values in Marks. Any individual not following the normal gait pattern will have wrong values produced by the code.
library(tidyverse)
l <- df %>% group_by(ID, Gait_nr) %>% filter(grepl("L.+", Marks)) %>%
summarize(Total_gait = Frame[4] - Frame[1],
Side = "left")
r <- df %>% group_by(ID, Gait_nr) %>% filter(grepl("R.+", Marks)) %>%
summarize(Total_gait = Frame[4] - Frame[1],
Side = "right")
val <- union(l,r, by=c("ID", "Gait_nr", "Side")) %>% arrange(ID, Gait_nr, Side)
Can you help me make my code more stable by helping me change e.g. Frame[4] to something like Frame[Marks=="LFCH" the last time ]?
If both LFCH and RFCH happen exactly twice, you can filter and then use diff in summarize:
df %>%
group_by(ID, Gait_nr) %>%
summarise(
left = diff(Frame[Marks == 'LFCH']),
right = diff(Frame[Marks == 'RFCH'])
)
# A tibble: 10 x 4
# Groups: ID [?]
# ID Gait_nr left right
# <int> <int> <dbl> <dbl>
# 1 1 1 14 18
# 2 1 2 14 18
# 3 2 1 14 18
# 4 2 2 14 18
# 5 3 1 14 18
# 6 3 2 14 18
# 7 4 1 14 18
# 8 4 2 14 18
# 9 5 1 14 18
#10 5 2 14 18
We can use first and last from the dplyr package.
library(dplyr)
df2 <- df %>%
filter(Marks %in% "LFCH") %>%
group_by(ID, Gait_nr) %>%
summarise(Total_gait = last(Frame) - first(Frame)) %>%
ungroup()
df2
# # A tibble: 10 x 3
# ID Gait_nr Total_gait
# <int> <int> <dbl>
# 1 1 1 14
# 2 1 2 14
# 3 2 1 14
# 4 2 2 14
# 5 3 1 14
# 6 3 2 14
# 7 4 1 14
# 8 4 2 14
# 9 5 1 14
# 10 5 2 14

Resources