I have a df looking like this:
Department ID Category.Department Category.ID
NA NA NA NA
Sales 101 2 4
Sales 101 2 4
NA NA NA NA
Sales 101 2 4
Sales 101 2 4
NA NA NA NA
Sales 101 2 4
Sales 101 2 4
# Example data: three groups, each introduced by an all-NA separator row.
df <- data.frame(
  Department = rep(c(NA, "Sales", "Sales"), times = 3),
  ID = rep(c(NA, 101, 101), times = 3),
  Category.Department = rep(c(NA, 2, 2), times = 3),
  Category.ID = rep(c(NA, 4, 4), times = 3),
  stringsAsFactors = FALSE
)
And I would like to have an output like this, where one column holds both the Department and the ID and another column holds the Category. The NA rows in each column are important because they separate the groups.
New.Col Category
NA NA
Sales 2
101 4
NA NA
Sales 2
101 4
NA NA
Sales 2
101 4
So far I tried with transpose, sapply and a function but it has not worked as I expected. Any suggestions in base?
Can't accept an answer without the true expected output.
# Label the rows 1, 2, 3 cyclically; with the example data the all-NA
# separator rows get group 1.
df$group <- rep(1:3, times = 3)
# Drop the group-3 rows, then reshape to long format: columns 1-2 are stacked
# into New.col and columns 3-4 into Category. reshape() also produces the
# `time` and `id` columns used below.
df2 <- reshape(df[df$group != 3,], direction = "long", varying = list(New.col = c(1,2), Category = c(3,4)),
idvar = "id", v.names = c("New.col", "Category"))
# Sort by id so the rows coming from the same original row sit together.
df3 <- df2[order(df2$id),]
# Each group-1 row appears once per time value; drop its time == 1 copy and
# keep columns 3 and 4 (printed below as New.col and Category).
df3[!(df3$time == 1 & df3$group == 1), c(3,4)]
New.col Category
1.2 <NA> NA
2.1 Sales 2
2.2 101 4
3.2 <NA> NA
4.1 Sales 2
4.2 101 4
5.2 <NA> NA
6.1 Sales 2
6.2 101 4
Here is a different approach than casting to long format, which relies on coalesce. In addition, I created a group variable and removed the NA rows, as they will not serve a purpose in your analysis, i.e.
library(tidyverse)

# Collapse every NA-delimited group into two rows: one holding the Department
# value and one holding the ID value, each with its matching category.
df %>%
  # A row that is NA in every column starts a new group.
  group_by(grp = cumsum(rowSums(is.na(.)) == ncol(.))) %>%
  # Shift the ID columns down and the Department columns up within each group
  # so that coalesce() below can pick one value per row.
  # across() replaces the deprecated mutate_at() + funs() combination.
  mutate(
    across(contains("ID"), lag),
    across(contains("Department"), lead)
  ) %>%
  mutate(
    new.col = coalesce(Department, as.character(ID)),
    category = coalesce(Category.Department, Category.ID)
  ) %>%
  select(grp, new.col, category) %>%
  distinct()
which gives,
# A tibble: 6 x 3
# Groups: grp [3]
grp new.col category
<int> <chr> <dbl>
1 1 Sales 2
2 1 101 4
3 2 Sales 2
4 2 101 4
5 3 Sales 2
6 3 101 4
Related
# Example data; WANT holds the expected result (the C score minus the A score,
# placed on the student's C row).
DATA <- data.frame(
  STUDENT = c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4),
  SCORE = c(6, 4, 8, 10, 9, 0, 2, 3, 3, 7),
  CLASS = c("A", "B", "C", "A", "B", "C", "B", "C", "A", "B"),
  WANT = c(NA, NA, 2, NA, NA, -10, NA, NA, NA, NA)
)
I have DATA and wish to create 'WANT', which is calculated by:
For each STUDENT, find the SCORE where SCORE equals to SCORE(CLASS = C) - SCORE(CLASS = A)
EX: SCORE(STUDENT = 1, CLASS = C) - SCORE(STUDENT = 1, CLASS = A) = 8-6=2
Assuming at most one 'C' and 'A' CLASS per each 'STUDENT', just subset the 'SCORE' where the CLASS value is 'C', 'A', do the subtraction and assign the value only to position where CLASS is 'C' by making all other positions to NA (after grouping by 'STUDENT')
library(dplyr)

# Per STUDENT: the first "C" score minus the first "A" score, kept only on the
# rows where CLASS is "C".
# NA^(CLASS != "C") is 1 on "C" rows (NA^0) and NA everywhere else (NA^1), so
# the multiplication blanks out every non-"C" row.
DATA <- ungroup(
  mutate(
    group_by(DATA, STUDENT),
    WANT2 = (SCORE[CLASS == "C"][1] - SCORE[CLASS == "A"][1]) *
      NA^(CLASS != "C")
  )
)
-output
# A tibble: 10 × 5
STUDENT SCORE CLASS WANT WANT2
<dbl> <dbl> <chr> <dbl> <dbl>
1 1 6 A NA NA
2 1 4 B NA NA
3 1 8 C 2 2
4 2 10 A NA NA
5 2 9 B NA NA
6 2 0 C -10 -10
7 3 2 B NA NA
8 3 3 C NA NA
9 4 3 A NA NA
10 4 7 B NA NA
Here is a solution with the data organized in a wider format first, then a longer format below. This solution works regardless of the order of the "CLASS" column (for instance, if there is one instance in which the CLASS order is CBA or BCA instead of ABC, this solution will still work).
Solution
library(dplyr)
library(tidyr)

# One row per STUDENT with one column per CLASS, plus WANT = C - A.
# id_cols = STUDENT keeps any other column still present in DATA (for example
# a helper column added earlier) from splitting a student over several rows.
# C - A is already vectorized, so no rowwise()/ungroup() round trip is needed.
wider <- DATA %>%
  select(-WANT) %>%
  pivot_wider(id_cols = STUDENT, names_from = "CLASS", values_from = "SCORE") %>%
  mutate(WANT = C - A)
output wider
# A tibble: 4 × 5
STUDENT A B C WANT
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 6 4 8 2
2 2 10 9 0 -10
3 3 NA 2 3 NA
4 4 3 7 NA NA
If you really want like your output example, then we can reorganize the wider data this way:
Reorganizing wider to long format
# Back to one row per STUDENT/CLASS pair; WANT is kept on the "C" rows only.
wider %>%
  pivot_longer(cols = A:C, names_to = "CLASS", values_to = "SCORE") %>%
  relocate(WANT, .after = SCORE) %>%
  mutate(WANT = if_else(CLASS != "C", NA_real_, WANT))
Final Output
# A tibble: 12 × 4
STUDENT CLASS SCORE WANT
<dbl> <chr> <dbl> <dbl>
1 1 A 6 NA
2 1 B 4 NA
3 1 C 8 2
4 2 A 10 NA
5 2 B 9 NA
6 2 C 0 -10
7 3 A NA NA
8 3 B 2 NA
9 3 C 3 NA
10 4 A 3 NA
11 4 B 7 NA
12 4 C NA NA
lets say I have the following data frame:
# Example data: one id with three parameters, each with a start and end day.
dt <- data.frame(
  id = 1,
  parameter = c("a", "b", "c"),
  start_day = c(1, 8, 4),
  end_day = c(16, NA, 30)
)
I need to combine the start_day and end_day columns (let's call the new column day) such that I preserve all the other columns. Also, I need to create another column that indicates whether each row shows start_day or end_day. To clarify, I am looking to create the following data frame
I am creating the above data frame using the following code:
# Start rows: drop end_day, rename start_day to day, flag with start = 1.
dt1 <- dt %>%
  select(-end_day) %>%
  rename(day = start_day) %>%
  mutate(start = 1)

# End rows: drop start_day, rename end_day to day, flag with end = 1.
dt2 <- dt %>%
  select(-start_day) %>%
  rename(day = end_day) %>%
  mutate(end = 1)

# Stack both halves (the flag missing from each half is filled with NA), then
# sort so every parameter's start row is followed by its end row.
dt <- bind_rows(dt1, dt2)
dt <- dt[with(dt, order(id, parameter)), ]
Although my code works, I am not happy with my solution. I am certain that there is a better and cleaner way to do this. I would appreciate any input on better alternatives for tackling this problem.
# Stack start_day/end_day into `day`; the generated `name` column records the
# source column and drives the two indicator columns.
dt |>
  tidyr::pivot_longer(cols = c(start_day, end_day), values_to = "day") |>
  dplyr::mutate(
    start = ifelse(name == "start_day", 1, NA),
    end = ifelse(name == "end_day", 1, NA)
  )
Result:
# A tibble: 6 × 6
id parameter name day start end
<dbl> <chr> <chr> <dbl> <dbl> <dbl>
1 1 a start_day 1 1 NA
2 1 a end_day 16 NA 1
3 1 b start_day 8 1 NA
4 1 b end_day NA NA 1
5 1 c start_day 4 1 NA
6 1 c end_day 30 NA 1
You could get rid of the name column, but maybe it would be more useful than your new start/end columns?
using base R (faster than data.table up to ~300 rows; faster than tidyr up to ~1k rows) :
# Stack start_day and end_day into a single `day` column and flag each row as
# a start or an end.
# The flags are built together with the stacked values, before sorting, so
# they stay correct when an (id, parameter) pair occurs more than once and
# when `dt` has zero rows.
n_rows <- nrow(dt)
stacked <- data.frame(
  dt[rep(seq_len(n_rows), times = 2L), 1:2],
  day = c(dt$start_day, dt$end_day),
  start = rep(c(1, NA), each = n_rows),
  end = rep(c(NA, 1), each = n_rows),
  row.names = NULL # number the rows 1..2n: starts first, then ends
)
# order() is stable by default, so each start row stays ahead of its end row.
stacked[order(stacked$id, stacked$parameter), ]
id parameter day start end
1 1 a 1 1 NA
4 1 a 16 NA 1
2 1 b 8 1 NA
5 1 b NA NA 1
3 1 c 4 1 NA
6 1 c 30 NA 1
using the data.table package (faster than tidyr up to ~500k rows) :
dt <- as.data.table(dt)
# Per (id, parameter) group, `day` lists all start days followed by all end
# days, so the flags must repeat each value .N times (`each = .N`) to line up
# with it. For groups with a single row this equals the alternating pattern.
dt[, .(day = c(start_day, end_day),
       start = rep(c(1, NA), each = .N),
       end = rep(c(NA, 1), each = .N)),
   by = .(id, parameter)]
id parameter day start end
1: 1 a 1 1 NA
2: 1 a 16 NA 1
3: 1 b 8 1 NA
4: 1 b NA NA 1
5: 1 c 4 1 NA
6: 1 c 30 NA 1
I have the following data
# Example data: row type (1 or 2) and an ID for every row.
df <- tibble(
  Type = c(1, 2, 2, 1, 1, 2),
  ID = c(6, 4, 3, 2, 1, 5)
)
Type ID
1 6
2 4
2 3
1 2
1 1
2 5
For each of the type 2 rows, I want to find the IDs of the type 1 rows just below and above them. For the above dataset, the output will be:
Type ID IDabove IDbelow
1 6 NA NA
2 4 6 2
2 3 6 2
1 2 NA NA
1 1 NA NA
2 5 1 NA
Naively, I can write a for loop to achieve this, but that would be too time consuming for the dataset I am dealing with.
One approach using dplyr lead,lag to get next and previous value respectively and data.table's rleid to create groups of consecutive Type values.
library(dplyr)
library(data.table)

# For every run of consecutive Type == 2 rows, attach the ID of the row just
# above the run and the ID of the row just below it.
df %>%
  mutate(
    # Run id: increases whenever Type changes from one row to the next.
    grp = rleid(Type),
    # Neighbouring IDs, kept on Type 2 rows only.
    IDabove = ifelse(Type == 2, lag(ID), NA),
    IDbelow = ifelse(Type == 2, lead(ID), NA)
  ) %>%
  group_by(grp) %>%
  # The first row of a run sees the row above it; the last row of a run sees
  # the row below it. Spread those two values over the whole run.
  mutate(IDabove = first(IDabove), IDbelow = last(IDbelow)) %>%
  ungroup() %>%
  select(-grp)
# Type ID IDabove IDbelow
# <dbl> <dbl> <dbl> <dbl>
#1 1 6 NA NA
#2 2 4 6 2
#3 2 3 6 2
#4 1 2 NA NA
#5 1 1 NA NA
#6 2 5 1 NA
A dplyr only solution:
You could create your own rleid function then apply the logic provided by Ronak(Many thanks. Upvoted).
library(dplyr)
# Run-length id: number the runs of consecutive equal values in `x`
# 1, 2, 3, ... and return one id per element of `x`.
my_func <- function(x) {
  run_lengths <- rle(x)$lengths
  run_ids <- seq_along(run_lengths)
  rep(run_ids, times = run_lengths)
}
# this part is the same as provided by Ronak.
# Neighbouring IDs are computed on the ungrouped data first (Type 2 rows
# only), then spread over each run of consecutive equal Type values.
df %>%
  mutate(
    IDabove = ifelse(Type == 2, lag(ID), NA),
    IDbelow = ifelse(Type == 2, lead(ID), NA)
  ) %>%
  group_by(grp = my_func(Type)) %>%
  mutate(
    IDabove = first(IDabove),
    IDbelow = last(IDbelow)
  ) %>%
  ungroup() %>%
  select(-grp)
Output:
Type ID IDabove IDbelow
<dbl> <dbl> <dbl> <dbl>
1 1 6 NA NA
2 2 4 6 2
3 2 3 6 2
4 1 2 NA NA
5 1 1 NA NA
6 2 5 1 NA
I have the following data frame in R
# Example data: a frequency for some ID/Value pairs.
df1 <- data.frame(
  ID = c("A", "B", "A", "B"),
  Value = c(1, 2, 5, 5),
  freq = c(1, 3, 5, 3)
)
I wish to obtain the following data frame
Value freq ID
1 1 A
2 NA A
3 NA A
4 NA A
5 1 A
1 NA B
2 2 B
3 NA B
4 NA B
5 5 B
I have tried the following code
# Asker's attempt, kept as posted: it yields the output quoted below it, not
# the desired one.
library(tidyverse)
# bind_cols() receives a single data frame here (note the trailing comma).
df_new <- bind_cols(df1 %>%
select(Value, freq, ID) %>%
# Only Value is expanded to its full min:max range; ID is not part of the
# expansion, so the Values 3 and 4 appear once, with NA for freq and ID.
complete(., expand(.,
Value = min(df1$Value):max(df1$Value))),)
I am getting the following output
Value freq ID
<dbl> <dbl> <fct>
1 1 A
2 3 B
3 NA NA
4 NA NA
5 5 A
5 3 B
I request someone to help me.
Using tidyr::full_seq we can find the full version of Value, but nesting(full_seq(Value, 1)) will return an error:
Error: by can't contain join column full_seq(Value, 1) which is missing from RHS
so we need to add a name, hence nesting(Value = full_seq(Value, 1))
library(tidyr)

# Cross every ID with the full sequence of Value (step 1); ID/Value pairs that
# are absent from df1 get NA in freq.
complete(df1, ID, nesting(Value = full_seq(Value, period = 1)))
# A tibble: 10 x 3
ID Value freq
<fct> <dbl> <dbl>
1 A 1. 1.
2 A 2. NA
3 A 3. NA
4 A 4. NA
5 A 5. 5.
6 B 1. NA
7 B 2. 3.
8 B 3. NA
9 B 4. NA
10 B 5. 3.
Using data.table:
library(data.table)
# Convert by reference and key on the join columns.
setDT(df1)
setkey(df1, ID, Value)
# Join df1 onto every ID x Value combination. The grid is derived from the
# data (all observed IDs, every integer step between the smallest and largest
# Value) instead of being hard-coded; pairs missing from df1 get NA in freq.
df1[CJ(ID = unique(df1$ID), Value = seq(min(df1$Value), max(df1$Value)))]
ID Value freq
1: A 1 1
2: A 2 NA
3: A 3 NA
4: A 4 NA
5: A 5 5
6: B 1 NA
7: B 2 3
8: B 3 NA
9: B 4 NA
10: B 5 3
Would the following approach work for you?
# Build the full ID x Value grid, then attach freq where it exists.
with(data = df1,
     expr = {
       value_range <- wrapr::seqi(min(Value), max(Value))
       ids <- unique(ID)
       # Repeat the whole value range once per ID and each ID once per value.
       # Plain recycling of unique(ID) alternates the IDs row by row and only
       # covers every pair when the two lengths share no common factor.
       data.frame(Value = rep(value_range, times = length(ids)),
                  ID = rep(ids, each = length(value_range)))
     }) %>%
  left_join(y = df1,
            by = c("ID" = "ID", "Value" = "Value")) %>%
  arrange(ID, Value)
Results
Value ID freq
1 1 A 1
2 2 A NA
3 3 A NA
4 4 A NA
5 5 A 5
6 1 B NA
7 2 B 3
8 3 B NA
9 4 B NA
10 5 B 3
Comments
If I'm following your example correctly, your ID group takes values from 1 to 5. If this is the case, my approach would be to generate that reading unique combinations of both from the original data frame.
The only variable that is carried from the original data frame is freq, which may or may not be available for a given ID-Value pair. I would join that variable via left_join (as you seem to like tidyverse).
In your example, you have freq variable with values 1,3,5 but then in the example you list 1,2,5? In my example, I took original freq and left join it. You can modify it further using normal dplyr pipeline, if this is something you intended to do.
I work with a data set of customers purchase baskets. Here is a sample of it:
basket item quant
1 1 B 1
2 1 A 2
3 1 C 1
4 2 A 1
5 2 C 1
6 3 A 2
7 4 B 1
8 4 C 1
Here is the code for reproducing it:
# Example data: one row per basket/item pair with the purchased quantity.
input <- data.frame(
  basket = c(1, 1, 1, 2, 2, 3, 4, 4),
  item = c("B", "A", "C", "A", "C", "A", "B", "C"),
  quant = c(1, 2, 1, 1, 1, 2, 1, 1)
)
So in the first basket there are three items with specified quantities. I have a custom function that only works with a specific format of input; We define a maximum basket size. Let's say it is 5. Now the input to that function should be like this:
basket item_1 item_2 item_3 item_4 item_5
1 1 B A A C <NA>
2 2 A C <NA> <NA> <NA>
3 3 A A <NA> <NA> <NA>
4 4 B C <NA> <NA> <NA>
I have been trying to do it using dplyr and summarise but have had no luck.
Any help would be appreciated!
Another possible solution:
library(dplyr)
library(tidyr)

# One row per basket with columns item_1 ... item_5 (NA-padded).
# seq_len() instead of 1:nrow() stays correct for an empty input, and
# pivot_wider() replaces the superseded spread().
input[rep(seq_len(nrow(input)), input$quant), ] %>% # one row per unit bought
  group_by(basket) %>%
  mutate(item2 = paste0("item_", row_number())) %>% # slot within the basket
  complete(item2 = paste0("item_", 1:5)) %>%        # pad to five slots
  ungroup() %>%
  select(-quant) %>%
  pivot_wider(names_from = item2, values_from = item)
which gives:
# A tibble: 4 x 6
basket item_1 item_2 item_3 item_4 item_5
<dbl> <fct> <fct> <fct> <fct> <fct>
1 1. B A A C NA
2 2. A C NA NA NA
3 3. A A NA NA NA
4 4. B C NA NA NA
Using the same logic, but with the data.table-package:
library(data.table)
# Convert `input` to a data.table by reference.
setDT(input)
# Chain, step by step:
#   1. repeat each row `quant` times (rep(.I, quant) gives the row numbers);
#   2. keep basket and item, adding item2 = "item_<k>" with k counting rows
#      within each basket via rowid(basket);
#   3. join onto every unique basket x item_1..item_5 combination, so slots
#      without a row end up with NA in item;
#   4. cast to wide format: one row per basket, one column per item2 value.
input[input[, rep(.I, quant)]
][, .(basket, item, item2 = paste0('item_', rowid(basket)))
][CJ(basket = basket, item2 = paste0('item_', 1:5), unique = TRUE)
, on = .(basket, item2)
][, dcast(.SD, basket ~ item2, value.var = 'item')]
Here is an idea via tidyverse. The tricks here are to replicate your rows based on quant, then remove the quant variable so it doesn't mess with your reshaping to wide data frame. After that you create a new variable that will deal with duplicates and of course finally spread to get the desired wide data frame.
library(tidyverse)

# One row per basket with one column per purchased unit (item_1, item_2, ...).
# Reads `input`, the data frame defined in the question. Rows are replicated
# by integer position, and pivot_wider() replaces the superseded spread().
input[rep(seq_len(nrow(input)), input$quant), ] %>%
  # Drop quant so it does not act as an identifier column when widening.
  select(-quant) %>%
  group_by(basket) %>%
  # Numbering the units within a basket gives duplicated items their own
  # column.
  mutate(new = paste0("item_", row_number())) %>%
  pivot_wider(names_from = new, values_from = item)
which gives,
# A tibble: 4 x 5
# Groups: basket [4]
basket item_1 item_2 item_3 item_4
<dbl> <fct> <fct> <fct> <fct>
1 1. B A A C
2 2. A C NA NA
3 3. A A NA NA
4 4. B C NA NA