Create columns based on date - r

case <- c("A","A","A","B","B","C","C","C","C")
date <- c("2022-01-01","2022-01-08","2022-06-07","2022-05-08","2022-03-06","2022-09-08","2022-09-23","2022-12-08","2022-06-05")
df <- data.frame(case,date)
I have a dataframe that looks like this:
# A tibble: 9 x 2
case date
<chr> <chr>
1 A 2022-01-01
2 A 2022-01-08
3 A 2022-06-07
4 B 2022-05-08
5 B 2022-03-06
6 C 2022-09-08
7 C 2022-09-23
8 C 2022-12-08
9 C 2022-06-05
I would like to essentially pivot_wider the rows based on date where the earliest date would become instance_1, next instance_2 and so far. I have tried the pivot_wider function but can't get the syntax right. Any help is appreciated.

We need a sequence column by 'case' and then do pivot_wider
library(tidyr)
library(dplyr)
library(data.table)
library(stringr)
df %>%
arrange(case, date) %>%
mutate(cn = str_c('instance_', rowid(case))) %>%
pivot_wider(names_from = cn, values_from = date)
-output
# A tibble: 3 × 5
case instance_1 instance_2 instance_3 instance_4
<chr> <chr> <chr> <chr> <chr>
1 A 2022-01-01 2022-01-08 2022-06-07 <NA>
2 B 2022-03-06 2022-05-08 <NA> <NA>
3 C 2022-06-05 2022-09-08 2022-09-23 2022-12-08
Or a similar option with dcast
library(data.table)
dcast(setDT(df)[order(case, date)],
case ~ paste0('instance_', rowid(case)), value.var = 'date')
-output
Key: <case>
case instance_1 instance_2 instance_3 instance_4
<char> <char> <char> <char> <char>
1: A 2022-01-01 2022-01-08 2022-06-07 <NA>
2: B 2022-03-06 2022-05-08 <NA> <NA>
3: C 2022-06-05 2022-09-08 2022-09-23 2022-12-08

Related

Convert any element that does not start with a specific string to NA

I have a large data frame that looks like df2.
I want to convert any element across columns code1, code2 that does not start with
AT to NA.
library(tidyverse)
df2 <- tibble(type=c("Jeep", "4x4", "convertible"),
code1=c("ATG1",NA, "ATG2"),
code2=c("random", "ATG3", "xyz"))
df2
#> # A tibble: 3 × 3
#> type code1 code2
#> <chr> <chr> <chr>
#> 1 Jeep ATG1 random
#> 2 4x4 <NA> ATG3
#> 3 convertible ATG2 xyz
Created on 2022-09-29 with reprex v2.0.2
I want my data to look like this
#> type code1 code2
#>
#> 1 Jeep ATG1 NA
#> 2 4x4 ATG3
#> 3 convertible ATG2 NA
You could do
df2 %>%
mutate(across(code1:code2, ~ifelse(substr(.x, 1, 2) == 'AT', .x, NA)))
#> # A tibble: 3 x 3
#> type code1 code2
#> <chr> <chr> <chr>
#> 1 Jeep ATG1 NA
#> 2 4x4 NA ATG3
#> 3 convertible ATG2 NA
With replace and grepl:
df2 %>%
mutate(across(starts_with("code"), ~ replace(.x, !grepl("^AT", .x), NA)))
Using case_when
library(dplyr)
library(stringr)
df2 %>%
mutate(across(starts_with('code'), ~ case_when(str_detect(.x, '^AT')~ .x)))
-output
# A tibble: 3 × 3
type code1 code2
<chr> <chr> <chr>
1 Jeep ATG1 <NA>
2 4x4 <NA> ATG3
3 convertible ATG2 <NA>

R Rolling Counts of Additions, or Deletions, to a List

df <- data.frame(date = as.Date(c(rep("2022-01-01", 3),
rep("2022-02-01", 3),
rep("2022-03-01", 4))),
flavor = c("Almond", "Apple", "Apricot",
"Almond", "Maple", "Mint",
"Apricot", "Pecan", "Praline", "Pumpkin"))
#> date flavor
#> 1 2022-01-01 Almond
#> 2 2022-01-01 Apple
#> 3 2022-01-01 Apricot
#> 4 2022-02-01 Almond
#> 5 2022-02-01 Maple
#> 6 2022-02-01 Mint
#> 7 2022-03-01 Apricot
#> 8 2022-03-01 Pecan
#> 9 2022-03-01 Praline
#> 10 2022-03-01 Pumpkin
The R data frame above tracks ice cream flavors at an ice cream shop, month to month. In the month of February there were two flavors added that were not present in the month of January (Maple, Mint), and two flavors removed (Apple, Apricot) that were present in January. In the month of March there were four flavors added that were not present in the month of February (Apricot, Pecan, Praline, Pumpkin), and three flavors removed (Almond, Maple, Mint) that were present in February.
#> date flavors.added flavors.removed
#> 1 2022-01-01 <NA> <NA>
#> 2 2022-02-01 2 2
#> 3 2022-03-01 4 3
How do I write an R script to calculate the summary data frame above? That is to say I want a rolling count of ice cream flavors that were added per month that were not present in the prior month, and also a count of flavors removed per month that were present in the prior month.
Using data.table:
library(data.table)
df2 = setDT(df)[, .(flavors = list(flavor)), date]
for (i in 2:nrow(df2))
set(
df2, i = i,
j = c('flavors_added', 'flavors_removed'),
list(length(setdiff(df2$flavors[[i]], df2$flavors[[i-1]])), length(setdiff(df2$flavors[[i-1]], df2$flavors[[i]])))
)
df2
# date flavors flavors_added flavors_removed
# <Date> <list> <int> <int>
# 1: 2022-01-01 Almond,Apple,Apricot NA NA
# 2: 2022-02-01 Almond,Maple,Mint 2 2
# 3: 2022-03-01 Apricot,Pecan,Praline,Pumpkin 4 3
In dplyr:
library(dplyr)
df %>%
group_by(date) %>%
summarise(flavors = list(flavor)) %>%
mutate(flavors.added = lengths(mapply(setdiff, flavors, lag(flavors))),
flavors.removed = lengths(mapply(setdiff, lag(flavors), flavors)))
output
# A tibble: 3 × 4
date flavors flavors.added flavors.removed
<date> <list> <int> <int>
1 2022-01-01 <chr [3]> 3 0
2 2022-02-01 <chr [3]> 2 2
3 2022-03-01 <chr [4]> 4 3

R: Get unique values based on criteria from 2 other columns

Hi I would like to get only 1 unique Code for each rows. To get that 1 uniqe Code the criteria should be get nearest Refresh Date that is >= Effective Date. And if there is no Refresh date that is >= Effective date then just get the nearest Resfresh Date < Effective date.
Below is my sample dataframe.
Code <- c("A","A","A", "A", "B", "B", "B", "B", "C","C","C","C")
Effective_Date <- as.Date(c("2020-08-25","2020-08-25","2020-08-25","2020-08-25","2021-12-18","2021-12-18",
"2021-12-18","2021-12-18","2021-10-15","2021-10-15","2021-10-15","2021-10-15"))
Refresh_Date <- as.Date(c("2020-09-25","2021-09-17","2022-11-25","2020-02-20","2021-12-12","2021-12-18",
"2022-01-15","2021-08-19","2021-08-20","2020-08-25","2021-09-30","2020-08-25"))
DF <- data.frame(Code,Effective_Date,Refresh_Date)
> DF
Code Effective_Date Refresh_Date
1 A 2020-08-25 2021-09-17
2 A 2020-08-25 2020-09-25
3 A 2020-08-25 2022-11-25
4 A 2020-08-25 2020-02-20
5 B 2021-12-18 2021-12-14
6 B 2021-12-18 2021-12-18
7 B 2021-12-18 2022-01-15
8 B 2021-12-18 2021-08-19
9 C 2021-10-15 2021-08-20
10 C 2021-10-15 2020-08-25
11 C 2021-10-15 2021-09-30
12 C 2021-10-15 2020-08-25
It's just like aggregating to Code and Effective Date. But get the row that has the nearest Refresh Date >= Effective Date. And if there is no Refresh Date that is >= Effective Date then just get the nearest Refresh Date < Effective Date.
Below is my desired output:
> DF_DesiredOutput
Code Effective_Date Refresh_Date
1 A 2020-08-25 2020-09-25
2 B 2021-12-18 2021-12-18
3 C 2021-10-15 2021-09-30
We can use slice on the difference of 'Refresh_Date' and 'Effective_Date', get the index of the min value, after grouping by 'Code'
library(dplyr)
DF %>%
group_by(Code) %>%
slice(which.min(abs(Refresh_Date - Effective_Date))) %>%
ungroup
-output
# A tibble: 3 × 3
Code Effective_Date Refresh_Date
<chr> <date> <date>
1 A 2020-08-25 2020-09-25
2 B 2021-12-18 2021-12-18
3 C 2021-10-15 2021-09-30
Here is an alternative approach using arrange by the absolute difference and then slice:
library(dplyr)
DF %>%
group_by(Code) %>%
arrange(abs(Refresh_Date-Effective_Date), .by_group = TRUE) %>%
slice(1)
Code Effective_Date Refresh_Date
<chr> <date> <date>
1 A 2020-08-25 2020-09-25
2 B 2021-12-18 2021-12-18
3 C 2021-10-15 2021-09-30

Messy string column to wide format

I have the following data set:
Input
date string value
2021-01-01 a=uk_b=goo1_c=brandA_d=phrase_d1 for pedro 2020 20
2021-02-01 a=us_b=goo2_c=brandB_d=phrase_d2 for peter 2020 30
2021-01-15 a=ca_b=goo2_c=brandC_e=102331 40
2022-01-15 2 0
I need to create a wide data frame based on values in string (see output below).I have hundreds of names, this is just a reproducible example.
Desired output
date a b c d e value 2
2021-01-01 uk goo1 brandA phrase_d1 for pedro 2020 NA 20 NA
2021-02-01 us goo2 brandB phrase_d2 for peter 2020 NA 30 NA
2021-01-15 ca goo2 brandC NA 102331 40 NA
2022-01-15 NA NA NA NA NA 0 NA
What would be a neat solution? I'm thinking of a combination of reshape and sub probably will take care of it.
Data
data = data.frame(date =c("2021-01-01","2021-02-01","2021-01-15","2022-01-15"),
string = c("a=uk_b=goo1_c=brandA_d=phrase_d1 for pedro 2020",
"a=us_b=goo2_c=brandB_d=phrase_d2 for peter 2020",
"a=ca_b=goo2_c=brandC_e=102331",2),
value = c(20,30,40,0))
Another possible solution:
library(tidyverse)
data %>%
separate_rows(string, sep="_(?!d\\d)") %>%
separate(string, into=c("n1", "n2"), sep = "=", fill = "right") %>%
pivot_wider(id_cols = c(date, value), names_from = n1, values_from = n2)
#> # A tibble: 3 × 7
#> date value a b c d e
#> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
#> 1 2021-01-01 20 uk goo1 brandA phrase_d1 for pedro 2020 <NA>
#> 2 2021-02-01 30 us goo2 brandB phrase_d2 for peter 2020 <NA>
#> 3 2021-01-15 40 ca goo2 brandC <NA> 102331
#PaulS's solution is more succinct than mine, but requires that the only underscores in the strings that are to be printed in the variables have a d and then a number behind them. If there are underscores that have other unknown patterns following them, the solution will break. Here's a quick example:
dat <- tibble::tribble(
~date, ~string, ~value,
"2021-01-01", "abc=uk_def=goo1_ghi=brandA_jkl=phrase_dx for pedro 2020", 20,
"2021-02-01", "abc=us_def=goo2_ghi=brandB_jkl=phrase_d2 for peter 2020", 30,
"2021-01-015", "abc=ca_def=goo2_ghi=brandC_mno=102331", 40)
library(stringr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
dat %>%
separate_rows(string, sep="_(?!d\\d)") %>%
separate(string, into=c("n1", "n2"), sep = "=", fill = "right") %>%
pivot_wider(id_cols = c(date, value), names_from = n1, values_from = n2)
#> # A tibble: 3 × 8
#> date value abc def ghi jkl `dx for pedro …` mno
#> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 2021-01-01 20 uk goo1 brandA phrase <NA> <NA>
#> 2 2021-02-01 30 us goo2 brandB phrase_d2 for pet… <NA> <NA>
#> 3 2021-01-015 40 ca goo2 brandC <NA> <NA> 1023…
My solution is a bit more complicated, but I think it works in a wider variety of cases:
make_df <- function(string){
str <- str_split(string, "=", simplify=TRUE)
if(length(str) == 1){
nm <- str[1]
str <- list(NA)
names(str) <- nm
}
if(length(str) > 1){
nm <- c(str[1], gsub(".*_(.*?)$", "\\1", str[2:(length(str)-1)]))
str <- str[-1]
str <- gsub(paste0("_", nm, collapse="|"), "", str)
str <- as.list(str)
names(str) <- nm
}
do.call(data.frame, str)
}
dat %>%
rowwise() %>%
mutate(out = make_df(string)) %>%
unnest(out) %>%
select(-string)
#> # A tibble: 4 × 8
#> date value abc def ghi jkl mno X2
#> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <lgl>
#> 1 2021-01-01 20 uk goo1 brandA phrase_dx for pedro 2020 <NA> NA
#> 2 2021-02-01 30 us goo2 brandB phrase_d2 for peter 2020 <NA> NA
#> 3 2021-01-015 40 ca goo2 brandC <NA> 102331 NA
#> 4 2021-91-15 0 <NA> <NA> <NA> <NA> <NA> NA
Created on 2022-04-08 by the reprex package (v2.0.1)
If the strings with underscores in them are as regular as in the example, the #PaulS's solution is better. Otherwise, this one might work.

match data frames based on multiple columns in R

I have two huge datasets that look like this.
there is one fruit from df2, PEACH, which is missing for any reason from df1.
I want to add in df1 the fruits that are missing.
library(tidyverse)
df1 <- tibble(central_fruit=c("ananas","apple"),
fruits=c("ananas,anan,anannas",("apple,appl,appless")),
counts=c("100,10,1","50,20,2"))
df1
#> # A tibble: 2 × 3
#> central_fruit fruits counts
#> <chr> <chr> <chr>
#> 1 ananas ananas,anan,anannas 100,10,1
#> 2 apple apple,appl,appless 50,20,2
df2 <- tibble(fruit=c("ananas","anan","anannas","apple","appl","appless","PEACH"),
counts=c(100,10,1,50,20,2,1000))
df2
#> # A tibble: 7 × 2
#> fruit counts
#> <chr> <dbl>
#> 1 ananas 100
#> 2 anan 10
#> 3 anannas 1
#> 4 apple 50
#> 5 appl 20
#> 6 appless 2
#> 7 PEACH 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
I want my data to look like this
df1
central_fruit fruits counts
<chr> <chr> <chr>
1 ananas ananas,anan,anannas 100,10,1
2 apple apple,appl,appless 50,20,2
3 PEACH NA 1000
any help or advice are highly appreciated
Please find below one possible data.table approach.
Reprex
Code
library(tidyverse) # to read your tibbles
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = .(central_fruit = fruit)
][, `:=` (counts = fcoalesce(counts, as.character(i.counts)), i.counts = NULL)
][central_fruit %chin% c(df1$central_fruit, setdiff(df2$fruit, unlist(strsplit(df1$fruit, ","))))][]
Output
#> central_fruit fruits counts
#> 1: ananas ananas,anan,anannas 100,10,1
#> 2: apple apple,appl,appless 50,20,2
#> 3: PEACH <NA> 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
You can just take the set of fruits present in your df1 and use them to filter df2, then bind them together.
library(tidyverse)
present <- df1$fruits |>
str_split(",") |>
unlist()
df2 |>
rename(central_fruit = fruit) |>
filter(! central_fruit %in% present) |>
mutate(counts = as.character(counts)) |>
bind_rows(df1)
#> # A tibble: 3 × 3
#> central_fruit counts fruits
#> <chr> <chr> <chr>
#> 1 PEACH 1000 <NA>
#> 2 ananas 100,10,1 ananas,anan,anannas
#> 3 apple 50,20,2 apple,appl,appless
You may get the dataset in a long format by splitting on comma fruits and counts variable, do a full_join with df2, adjust the NA values and for each central_fruit collapse the values.
library(dplyr)
library(tidyr)
df1 %>%
separate_rows(fruits, counts, convert = TRUE) %>%
full_join(df2, by = c('fruits' = 'fruit')) %>%
transmute(central_fruit = ifelse(is.na(central_fruit), fruits, central_fruit),
fruits = ifelse(is.na(counts.x), NA, fruits),
counts = coalesce(counts.x, counts.y)) %>%
group_by(central_fruit) %>%
summarise(across(.fns = toString))
# central_fruit fruits counts
# <chr> <chr> <chr>
#1 ananas ananas, anan, anannas 100, 10, 1
#2 apple apple, appl, appless 50, 20, 2
#3 PEACH NA 1000

Resources