Messy string column to wide format - r

I have the following data set:
Input
date string value
2021-01-01 a=uk_b=goo1_c=brandA_d=phrase_d1 for pedro 2020 20
2021-02-01 a=us_b=goo2_c=brandB_d=phrase_d2 for peter 2020 30
2021-01-15 a=ca_b=goo2_c=brandC_e=102331 40
2022-01-15 2 0
I need to create a wide data frame based on values in string (see output below).I have hundreds of names, this is just a reproducible example.
Desired output
date a b c d e value 2
2021-01-01 uk goo1 brandA phrase_d1 for pedro 2020 NA 20 NA
2021-02-01 us goo2 brandB phrase_d2 for peter 2020 NA 30 NA
2021-01-15 ca goo2 brandC NA 102331 40 NA
2022-01-15 NA NA NA NA NA 0 NA
What would be a neat solution? I'm thinking of a combination of reshape and sub probably will take care of it.
Data
data = data.frame(date =c("2021-01-01","2021-02-01","2021-01-15","2022-01-15"),
string = c("a=uk_b=goo1_c=brandA_d=phrase_d1 for pedro 2020",
"a=us_b=goo2_c=brandB_d=phrase_d2 for peter 2020",
"a=ca_b=goo2_c=brandC_e=102331",2),
value = c(20,30,40,0))

Another possible solution:
library(tidyverse)
data %>%
separate_rows(string, sep="_(?!d\\d)") %>%
separate(string, into=c("n1", "n2"), sep = "=", fill = "right") %>%
pivot_wider(id_cols = c(date, value), names_from = n1, values_from = n2)
#> # A tibble: 3 × 7
#> date value a b c d e
#> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
#> 1 2021-01-01 20 uk goo1 brandA phrase_d1 for pedro 2020 <NA>
#> 2 2021-02-01 30 us goo2 brandB phrase_d2 for peter 2020 <NA>
#> 3 2021-01-15 40 ca goo2 brandC <NA> 102331

#PaulS's solution is more succinct than mine, but requires that the only underscores in the strings that are to be printed in the variables have a d and then a number behind them. If there are underscores that have other unknown patterns following them, the solution will break. Here's a quick example:
dat <- tibble::tribble(
~date, ~string, ~value,
"2021-01-01", "abc=uk_def=goo1_ghi=brandA_jkl=phrase_dx for pedro 2020", 20,
"2021-02-01", "abc=us_def=goo2_ghi=brandB_jkl=phrase_d2 for peter 2020", 30,
"2021-01-015", "abc=ca_def=goo2_ghi=brandC_mno=102331", 40)
library(stringr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
dat %>%
separate_rows(string, sep="_(?!d\\d)") %>%
separate(string, into=c("n1", "n2"), sep = "=", fill = "right") %>%
pivot_wider(id_cols = c(date, value), names_from = n1, values_from = n2)
#> # A tibble: 3 × 8
#> date value abc def ghi jkl `dx for pedro …` mno
#> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 2021-01-01 20 uk goo1 brandA phrase <NA> <NA>
#> 2 2021-02-01 30 us goo2 brandB phrase_d2 for pet… <NA> <NA>
#> 3 2021-01-015 40 ca goo2 brandC <NA> <NA> 1023…
My solution is a bit more complicated, but I think it works in a wider variety of cases:
make_df <- function(string){
str <- str_split(string, "=", simplify=TRUE)
if(length(str) == 1){
nm <- str[1]
str <- list(NA)
names(str) <- nm
}
if(length(str) > 1){
nm <- c(str[1], gsub(".*_(.*?)$", "\\1", str[2:(length(str)-1)]))
str <- str[-1]
str <- gsub(paste0("_", nm, collapse="|"), "", str)
str <- as.list(str)
names(str) <- nm
}
do.call(data.frame, str)
}
dat %>%
rowwise() %>%
mutate(out = make_df(string)) %>%
unnest(out) %>%
select(-string)
#> # A tibble: 4 × 8
#> date value abc def ghi jkl mno X2
#> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <lgl>
#> 1 2021-01-01 20 uk goo1 brandA phrase_dx for pedro 2020 <NA> NA
#> 2 2021-02-01 30 us goo2 brandB phrase_d2 for peter 2020 <NA> NA
#> 3 2021-01-015 40 ca goo2 brandC <NA> 102331 NA
#> 4 2021-91-15 0 <NA> <NA> <NA> <NA> <NA> NA
Created on 2022-04-08 by the reprex package (v2.0.1)
If the strings with underscores in them are as regular as in the example, the #PaulS's solution is better. Otherwise, this one might work.

Related

Convert any element that does not start with a specific string to NA

I have a large data frame that looks like df2.
I want to convert any element across columns code1, code2 that does not start with
AT to NA.
library(tidyverse)
df2 <- tibble(type=c("Jeep", "4x4", "convertible"),
code1=c("ATG1",NA, "ATG2"),
code2=c("random", "ATG3", "xyz"))
df2
#> # A tibble: 3 × 3
#> type code1 code2
#> <chr> <chr> <chr>
#> 1 Jeep ATG1 random
#> 2 4x4 <NA> ATG3
#> 3 convertible ATG2 xyz
Created on 2022-09-29 with reprex v2.0.2
I want my data to look like this
#> type code1 code2
#>
#> 1 Jeep ATG1 NA
#> 2 4x4 ATG3
#> 3 convertible ATG2 NA
You could do
df2 %>%
mutate(across(code1:code2, ~ifelse(substr(.x, 1, 2) == 'AT', .x, NA)))
#> # A tibble: 3 x 3
#> type code1 code2
#> <chr> <chr> <chr>
#> 1 Jeep ATG1 NA
#> 2 4x4 NA ATG3
#> 3 convertible ATG2 NA
With replace and grepl:
df2 %>%
mutate(across(starts_with("code"), ~ replace(.x, !grepl("^AT", .x), NA)))
Using case_when
library(dplyr)
library(stringr)
df2 %>%
mutate(across(starts_with('code'), ~ case_when(str_detect(.x, '^AT')~ .x)))
-output
# A tibble: 3 × 3
type code1 code2
<chr> <chr> <chr>
1 Jeep ATG1 <NA>
2 4x4 <NA> ATG3
3 convertible ATG2 <NA>

R Rolling Counts of Additions, or Deletions, to a List

df <- data.frame(date = as.Date(c(rep("2022-01-01", 3),
rep("2022-02-01", 3),
rep("2022-03-01", 4))),
flavor = c("Almond", "Apple", "Apricot",
"Almond", "Maple", "Mint",
"Apricot", "Pecan", "Praline", "Pumpkin"))
#> date flavor
#> 1 2022-01-01 Almond
#> 2 2022-01-01 Apple
#> 3 2022-01-01 Apricot
#> 4 2022-02-01 Almond
#> 5 2022-02-01 Maple
#> 6 2022-02-01 Mint
#> 7 2022-03-01 Apricot
#> 8 2022-03-01 Pecan
#> 9 2022-03-01 Praline
#> 10 2022-03-01 Pumpkin
The R data frame above tracks ice cream flavors at an ice cream shop, month to month. In the month of February there were two flavors added that were not present in the month of January (Maple, Mint), and two flavors removed (Apple, Apricot) that were present in January. In the month of March there were four flavors added that were not present in the month of February (Apricot, Pecan, Praline, Pumpkin), and three flavors removed (Almond, Maple, Mint) that were present in February.
#> date flavors.added flavors.removed
#> 1 2022-01-01 <NA> <NA>
#> 2 2022-02-01 2 2
#> 3 2022-03-01 4 3
How do I write an R script to calculate the summary data frame above? That is to say I want a rolling count of ice cream flavors that were added per month that were not present in the prior month, and also a count of flavors removed per month that were present in the prior month.
Using data.table:
library(data.table)
df2 = setDT(df)[, .(flavors = list(flavor)), date]
for (i in 2:nrow(df2))
set(
df2, i = i,
j = c('flavors_added', 'flavors_removed'),
list(length(setdiff(df2$flavors[[i]], df2$flavors[[i-1]])), length(setdiff(df2$flavors[[i-1]], df2$flavors[[i]])))
)
df2
# date flavors flavors_added flavors_removed
# <Date> <list> <int> <int>
# 1: 2022-01-01 Almond,Apple,Apricot NA NA
# 2: 2022-02-01 Almond,Maple,Mint 2 2
# 3: 2022-03-01 Apricot,Pecan,Praline,Pumpkin 4 3
In dplyr:
library(dplyr)
df %>%
group_by(date) %>%
summarise(flavors = list(flavor)) %>%
mutate(flavors.added = lengths(mapply(setdiff, flavors, lag(flavors))),
flavors.removed = lengths(mapply(setdiff, lag(flavors), flavors)))
output
# A tibble: 3 × 4
date flavors flavors.added flavors.removed
<date> <list> <int> <int>
1 2022-01-01 <chr [3]> 3 0
2 2022-02-01 <chr [3]> 2 2
3 2022-03-01 <chr [4]> 4 3

match data frames based on multiple columns in R

I have two huge datasets that look like this.
there is one fruit from df2, PEACH, which is missing for any reason from df1.
I want to add in df1 the fruits that are missing.
library(tidyverse)
df1 <- tibble(central_fruit=c("ananas","apple"),
fruits=c("ananas,anan,anannas",("apple,appl,appless")),
counts=c("100,10,1","50,20,2"))
df1
#> # A tibble: 2 × 3
#> central_fruit fruits counts
#> <chr> <chr> <chr>
#> 1 ananas ananas,anan,anannas 100,10,1
#> 2 apple apple,appl,appless 50,20,2
df2 <- tibble(fruit=c("ananas","anan","anannas","apple","appl","appless","PEACH"),
counts=c(100,10,1,50,20,2,1000))
df2
#> # A tibble: 7 × 2
#> fruit counts
#> <chr> <dbl>
#> 1 ananas 100
#> 2 anan 10
#> 3 anannas 1
#> 4 apple 50
#> 5 appl 20
#> 6 appless 2
#> 7 PEACH 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
I want my data to look like this
df1
central_fruit fruits counts
<chr> <chr> <chr>
1 ananas ananas,anan,anannas 100,10,1
2 apple apple,appl,appless 50,20,2
3 PEACH NA 1000
any help or advice are highly appreciated
Please find below one possible data.table approach.
Reprex
Code
library(tidyverse) # to read your tibbles
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = .(central_fruit = fruit)
][, `:=` (counts = fcoalesce(counts, as.character(i.counts)), i.counts = NULL)
][central_fruit %chin% c(df1$central_fruit, setdiff(df2$fruit, unlist(strsplit(df1$fruit, ","))))][]
Output
#> central_fruit fruits counts
#> 1: ananas ananas,anan,anannas 100,10,1
#> 2: apple apple,appl,appless 50,20,2
#> 3: PEACH <NA> 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
You can just take the set of fruits present in your df1 and use them to filter df2, then bind them together.
library(tidyverse)
present <- df1$fruits |>
str_split(",") |>
unlist()
df2 |>
rename(central_fruit = fruit) |>
filter(! central_fruit %in% present) |>
mutate(counts = as.character(counts)) |>
bind_rows(df1)
#> # A tibble: 3 × 3
#> central_fruit counts fruits
#> <chr> <chr> <chr>
#> 1 PEACH 1000 <NA>
#> 2 ananas 100,10,1 ananas,anan,anannas
#> 3 apple 50,20,2 apple,appl,appless
You may get the dataset in a long format by splitting on comma fruits and counts variable, do a full_join with df2, adjust the NA values and for each central_fruit collapse the values.
library(dplyr)
library(tidyr)
df1 %>%
separate_rows(fruits, counts, convert = TRUE) %>%
full_join(df2, by = c('fruits' = 'fruit')) %>%
transmute(central_fruit = ifelse(is.na(central_fruit), fruits, central_fruit),
fruits = ifelse(is.na(counts.x), NA, fruits),
counts = coalesce(counts.x, counts.y)) %>%
group_by(central_fruit) %>%
summarise(across(.fns = toString))
# central_fruit fruits counts
# <chr> <chr> <chr>
#1 ananas ananas, anan, anannas 100, 10, 1
#2 apple apple, appl, appless 50, 20, 2
#3 PEACH NA 1000

R: Convert monthly data into daily data for panel data

I have the following data:
5 Products with a monthly rating from 2018-08 to 2018-12
Now with the help of R programming I would like to convert the monthly data into daily data and to have panel data.The monthly rating for each product will also be the rating for each day in the respective month.
So, that the new data will look like:
(with the first column being the product, the second column the date and the third column the rating)
A 2018-08-01 1
A 2018-08-02 1
A 2018-08-03 1
A 2018-08-04 1
... so on
A 2018-09-01 1
A 2018-09-02 1
...so on
A 2018-12-31 1
B 2018-08-01 3
B 2018-08-02 3
... so on
E 2018-12-31 3
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
# example data
data <- tribble(
~Product, ~`Product Rating 2018-08`, ~`Product Rating 2018-10`,
"A", 1, 1,
"B", 3, 3,
)
data2 <-
data %>%
pivot_longer(-Product) %>%
mutate(
name = name %>% str_extract("[0-9-]+$") %>% paste0("-01") %>% as.Date()
)
seq(as.Date("2018-08-01"), as.Date("2018-12-31"), by = "days") %>%
tibble(date = .) %>%
# left join on year and month
expand_grid(data2) %>%
filter(month(date) == month(name) & year(date) == year(name)) %>%
select(Product, date, value)
#> # A tibble: 124 × 3
#> Product date value
#> <chr> <date> <dbl>
#> 1 A 2018-08-01 1
#> 2 B 2018-08-01 3
#> 3 A 2018-08-02 1
#> 4 B 2018-08-02 3
#> 5 A 2018-08-03 1
#> 6 B 2018-08-03 3
#> 7 A 2018-08-04 1
#> 8 B 2018-08-04 3
#> 9 A 2018-08-05 1
#> 10 B 2018-08-05 3
#> # … with 114 more rows
Created on 2022-03-09 by the reprex package (v2.0.0)

Spread a data.frame with repetitive column

I have a large data.frame that I am trying to spread. A toy example looks like this.
data = data.frame(date = rep(c("2019", "2020"), 2), ticker = c("SPY", "SPY", "MSFT", "MSFT"), value = c(1, 2, 3, 4))
head(data)
date ticker value
1 2019 SPY 1
2 2020 SPY 2
3 2019 MSFT 3
4 2020 MSFT 4
I would like to spread it so the data.frame looks like this.
spread(data, key = ticker, value = value)
date MSFT SPY
1 2019 3 1
2 2020 4 2
However, when I do this on my actual data.frame, I get an error.
Error: Each row of output must be identified by a unique combination of keys.
Keys are shared for 18204 rows:
* 30341, 166871
* 30342, 166872
* 30343, 166873
* 30344, 166874
* 30345, 166875
* 30346, 166876
* 30347, 166877
* 30348, 166878
* 30349, 166879
* 30350, 166880
* 30351, 166881
* 30352, 166882
Below is a head and tail of my data.frame
head(df)
ref.date ticker weeklyReturn
<date> <chr> <dbl>
1 2008-02-01 SPY NA
2 2008-02-04 SPY NA
3 2008-02-05 SPY NA
4 2008-02-06 SPY NA
5 2008-02-07 SPY NA
6 2008-02-08 SPY -0.0478
tail(df)
ref.date ticker weeklyReturn
<date> <chr> <dbl>
1 2020-02-12 MDYV 0.00293
2 2020-02-13 MDYV 0.00917
3 2020-02-14 MDYV 0.0179
4 2020-02-18 MDYV 0.0107
5 2020-02-19 MDYV 0.00422
6 2020-02-20 MDYV 0.00347
You can use dplyr and tidyr packages. To get rid of that error, you would have to firstly sum the values for each group.
data %>%
group_by(date, ticker) %>%
summarise(value = sum(value)) %>%
pivot_wider(names_from = ticker, values_from = value)
# date MSFT SPY
# <fct> <dbl> <dbl>
# 1 2019 3 1
# 2 2020 4 2
As said in the comments, you have multiple values for same combination of date-ticker. You need to define what to do with it.
Here with a reprex:
library(tidyr)
library(dplyr)
# your data is more like:
data = data.frame(
date = c(2019, rep(c("2019", "2020"), 2)),
ticker = c("SPY", "SPY", "SPY", "MSFT", "MSFT"),
value = c(8, 1, 2, 3, 4))
# With two values for same date-ticker combination
data
#> date ticker value
#> 1 2019 SPY 8
#> 2 2019 SPY 1
#> 3 2020 SPY 2
#> 4 2019 MSFT 3
#> 5 2020 MSFT 4
# Results in error
data %>%
spread(ticker, value)
#> Error: Each row of output must be identified by a unique combination of keys.
#> Keys are shared for 2 rows:
#> * 1, 2
# New pivot_wider() Creates list-columns for duplicates
data %>%
pivot_wider(names_from = ticker, values_from = value,)
#> Warning: Values in `value` are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list(value = list)` to suppress this warning.
#> * Use `values_fn = list(value = length)` to identify where the duplicates arise
#> * Use `values_fn = list(value = summary_fun)` to summarise duplicates
#> # A tibble: 2 x 3
#> date SPY MSFT
#> <fct> <list> <list>
#> 1 2019 <dbl [2]> <dbl [1]>
#> 2 2020 <dbl [1]> <dbl [1]>
# Otherwise, decide yourself how to summarise duplicates with mean() for instance
data %>%
group_by(date, ticker) %>%
summarise(value = mean(value, na.rm = TRUE)) %>%
spread(ticker, value)
#> # A tibble: 2 x 3
#> # Groups: date [2]
#> date MSFT SPY
#> <fct> <dbl> <dbl>
#> 1 2019 3 4.5
#> 2 2020 4 2
Created on 2020-02-22 by the reprex package (v0.3.0)

Resources