Index multiple vectors into table in R - r

I have three vectors:
position <- c(13, 13, 24, 20, 24, 6, 13)
my_string_allele <- c("T>A", "T>A", "G>C", "C>A", "A>G", "A>G", "G>T")
position_ref <- c("12006", "1108", "13807", "1970", "9030", "2222", "4434")
I want to create a table (starting from the smallest position) as shown below. I want to account for the number of occurrence for each my_string_allele column for each position and have their corresponding position_ref in position_ref column. What would be the simplest way to do this?
position T>A position_ref G>C position_ref C>A position_ref A>G position_ref G>T position_ref
6 1 2222
13 2 12006, 1108 1 4434
20 1 1970
24 1 13807 1 9030

Here is a spread() method which stretches data to the wide format with mutate_all() to count the number of occurrences.
Data
library(tidyverse)
df <- data.frame(position, my_string_allele, position_ref, stringsAsFactors = F)
Code
df %>% group_by(position, my_string_allele) %>%
mutate(position_ref = paste(position_ref, collapse = ", ")) %>%
distinct() %>%
spread(my_string_allele, position_ref) %>%
mutate_all(funs(N = if_else(is.na(.), NA_integer_, lengths(str_split(., ", ")))))
Output
position `A>G` `C>A` `G>C` `G>T` `T>A` `A>G_N` `C>A_N` `G>C_N` `G>T_N` `T>A_N`
<dbl> <chr> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int>
1 6 2222 NA NA NA NA 1 NA NA NA NA
2 13 NA NA NA 4434 12006, 1108 NA NA NA 1 2
3 20 NA 1970 NA NA NA NA 1 NA NA NA
4 24 9030 NA 13807 NA NA 1 NA 1 NA NA
(You can sort the columns by their column names to get the output you show in the question.)

Full disclosure: I am adapting part of #DarrenTsai's answer with data.table to provide the number of occurrence as well (since it is missing from his answer). Using data.table:
library(data.table)
df <- data.frame(position, my_string_allele, position_ref, stringsAsFactors = F)
setDT(df)
df[, `:=`(position_ref = paste(.N, paste(position_ref, collapse = ", "))),
by = c("position", "my_string_allele")] %>%
unique(., by = c("position", "my_string_allele", "position_ref")) %>%
dcast(position ~ my_string_allele, value.var = "position_ref")
Result:
position A>G C>A G>C G>T T>A
1: 6 1 2222 <NA> <NA> <NA> <NA>
2: 13 <NA> <NA> <NA> 1 4434 2 12006, 1108
3: 20 <NA> 1 1970 <NA> <NA> <NA>
4: 24 1 9030 <NA> 1 13807 <NA> <NA>
With dplyr (largely based on #DarrenTsai's answer, should upvote his as well):
library(dplyr)
df %>% group_by(position, my_string_allele) %>%
mutate(position_ref = paste(n(), paste(position_ref, collapse = ", "))) %>%
distinct() %>%
tidyr::spread(my_string_allele, position_ref)

Related

How do I pivot_wider a char column?

I'm trying to pivot_wider a tibble of random alpha strings
stri_rand_strings(252, 5, '[a-z]') %>%
sort() %>%
as_tibble() %>%
mutate(id = row_number(),
col = rep(letters[1:4], each = length(value) / 4)) %>%
pivot_wider(names_from = col, values_from = value)
I get three columns of NA in a tibble (252 x 5):
# A tibble: 252 × 5
id a b c d
<int> <chr> <chr> <chr> <chr>
1 1 aarup NA NA NA
2 2 abhir NA NA NA
3 3 afpgt NA NA NA
4 4 apjts NA NA NA
5 5 arlst NA NA NA
6 6 awkjn NA NA NA
7 7 babro NA NA NA
8 8 bbrpn NA NA NA
9 9 bbrzt NA NA NA
10 10 bedzs NA NA NA
# … with 242 more rows
instead of the desired 63 x 5.
your id-column is messing everything up. rownumbers are unique, so casting to wide does not make sense, since you have got unique identifiers.
try something like
stringi::stri_rand_strings(252, 5, '[a-z]') %>%
sort() %>%
as_tibble() %>%
mutate(id = rep(1:(length(value) / 4), 4), # !! <-- !!
col = rep(letters[1:4], each = length(value) / 4)) %>%
pivot_wider(names_from = col, values_from = value)
# A tibble: 63 x 5
id a b c d
<int> <chr> <chr> <chr> <chr>
1 1 ababk glynv mottj tqcbv
2 2 abysq gmfhc mujcw twjix
3 3 aerkp godcs mycak tzqny
4 4 agtoa gpler naetp ucuvg
5 5 ahebl grqgz nfali ufbqv
6 6 amdvv gswwu nhmnu ulgup
7 7 apgut gvkwh nkcks umwih
8 8 atgxy gynef nkklm uojxc
9 9 bcklx hcdup nngfz upfhx
10 10 bcnxz hcpzy nnvpd uqlgs
# ... with 53 more rows

R split string to columns using string as column name and use any numbers as values in those columns

I have the following dataframe:
df1 = data.frame(id = 1:4, desc=c("httpmethod=put&hobbies=22.33&utiliites=50.00&home=950.00&entertainment=40.00&redirecturl=&stamp=5%0D%0A++++", "httpmethod=put&hobbies=&utiliites=&home=600.00&entertainment=25.57&redirecturl=&stamp=5%0D%0A++++", "httpmethod=put&hobbies=0.00&utiliites=&home=1127.53&entertainment=50.00&redirecturl=&stamp=5%0D%0A++++", "httpmethod=put&hobbies=&utiliites=&home=&entertainment=&redirecturl=&stamp=5%0D%0A++++"), stringsAsFactors=FALSE)
Which gives:
id
desc
1
httpmethod=put&hobbies=22.33&utiliites=50.00&home=950.00&entertainment=40.00&redirecturl=&stamp=5%0D%0A++++
2
httpmethod=put&hobbies=&utiliites=&home=600.00&entertainment=25.57&redirecturl=&stamp=5%0D%0A++++
3
httpmethod=put&hobbies=0.00&utiliites=&home=1127.53&entertainment=50.00&redirecturl=&stamp=5%0D%0A++++
4
httpmethod=put&hobbies=&utiliites=&home=&entertainment=&redirecturl=&stamp=5%0D%0A++++
I'd like:
id
hobbies
utilities
home
entertainment
1
22.33
50.00
950.00
40.00
2
NA
NA
600.00
25.57
3
0.00
NA
1127.53
50.00
4
NA
NA
NA
NA
I have looked at lots of different things but can't seem to bring it all together. The code I have at the moment is as below, but I'm thinking there must be a more simple/eloquent way (e.g. get the column names from the string).
library(dplyr)
library(tidyr)
library(stringr)
df2 <- df1 %>%
separate(desc, c("http","hob", "utl", "hom", "ent", "redirect", "stamp"), sep = "&") %>%
mutate(hobbies = str_extract(hob, "\\d+\\.*\\d*")) %>%
mutate(utilities = str_extract(utl, "\\d+\\.*\\d*")) %>%
mutate(home = str_extract(hom, "\\d+\\.*\\d*")) %>%
mutate(entertainment = str_extract(ent, "\\d+\\.*\\d*")) %>%
select(-c("http","redirect", "stamp"))
I am quite new to R so some explanation of the steps would be good. I did get to the point where I split them but ended up with a list and didn't know what to do to get the values out of the list.
Thanks
ignore warnings in this
library(tidyverse)
df1 %>%
separate_rows(desc, sep = '&') %>%
separate(desc, into = c('n', 'v'), sep = '=') %>%
pivot_wider(names_from = n, values_from = v, values_fn = as.numeric)
#> # A tibble: 4 x 8
#> id httpmethod hobbies utiliites home entertainment redirecturl stamp
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 NA 22.3 50 950 40 NA NA
#> 2 2 NA NA NA 600 25.6 NA NA
#> 3 3 NA 0 NA 1128. 50 NA NA
#> 4 4 NA NA NA NA NA NA NA
Created on 2021-07-26 by the reprex package (v2.0.0)
Once corrected third line hobbies0.00= as commented above,
library(dplyr)
library(tidyr)
df1 %>%
separate(col = desc, into = c("http", "hobbies", "utiliites", "home", "entertainment", "redirecturl", "stamp"), sep = "&[a-z]+[0\\.]*=") %>%
select(-http, -redirecturl, -stamp)
id hobbies utiliites home entertainment
1 1 22.33 50.00 950.00 40.00
2 2 600.00 25.57
3 3 0.00 1127.53 50.00
4 4
Update
A couple of modifications. One thanks to Shawn Brar comment, let's as.numeric all. The second one, to avoid specify the into vector (but having to remove some weird column):
df1 %>%
separate(col = desc, into = strsplit(df1$desc[1], split = "=.*?&")[[1]], sep = "&[a-z]+=") %>%
select(-httpmethod, -redirecturl, -`stamp=5%0D%0A++++`) %>%
mutate(across(everything(), as.numeric))
id hobbies utiliites home entertainment
1 1 22.33 50 950.00 40.00
2 2 NA NA 600.00 25.57
3 3 0.00 NA 1127.53 50.00
4 4 NA NA NA NA

Mutate row sum but only if NA count is 2 or less

I'm trying to mutate a new variable (sum) of 5 columns of data but only if NA count across affected columns (v2 to v6) is 2 or less otherwise return an NA. The code below sums only where there are no NA's. Help appreciated.
df <- data.frame(v1=c("A","B","C","D","E","F"), v2=c(4,NA,5,6,NA,NA), v3=c(7,8,9,NA,NA,NA),
v4=c(NA,3,5,NA,1,4), v5=c(NA,3,5,NA,1,NA), v6=c(NA,3,5,NA,1,4))
df
library(dplyr)
df = df %>%
rowwise() %>%
mutate(sum(v2, v3, v4, v5, v6))
df
In base R, we can use rowSums twice, 1st to count sum of values in each row and second to count number of NA's in R.
ifelse(rowSums(is.na(df[-1])) <= 2, rowSums(df[-1], na.rm = TRUE), NA)
#[1] NA 17 29 NA 3 NA
Using dplyr row-wise you can do this as :
library(dplyr)
df %>%
rowwise() %>%
mutate(col = ifelse(sum(is.na(c_across(v2:v6))) <= 2,
sum(c_across(v2:v6), na.rm = TRUE), NA))
# A tibble: 6 x 7
# v1 v2 v3 v4 v5 v6 col
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A 4 7 NA NA NA NA
#2 B NA 8 3 3 3 17
#3 C 5 9 5 5 5 29
#4 D 6 NA NA NA NA NA
#5 E NA NA 1 1 1 3
#6 F NA NA 4 NA 4 NA
Shortened the code using ifelse suggestion from #rpolicastro.

Function to check the value and add values according to the result

there!
I have a table:
532 obs. of 44 variables
Its looks like this:
A tibble: 10 x 44
ID PVD Vasculitis CVA CHF MI HTN COPD
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 11 NA NA NA NA NA 1 NA
2 22 1 NA 1 NA 1 1 1
3 33 NA NA NA NA 1 1 1
4 44 NA NA 1 NA NA NA 1
5 55 1 NA NA 1 1 1 1
6 66 NA NA NA 1 1 1 1
7 77 NA NA NA NA NA NA NA
8 88 1 NA 1 1 1 1 1
9 99 NA NA NA NA NA 1 1
10 1010 NA NA NA 1 1 1 NA
# ... with 36 more variables: TB <dbl>, Diabetes <dbl>,
# Liver <dbl>, CRF <dbl>, Dementia <dbl>, Obesity <dbl>,
# Hearing_loss <dbl>, Paraplegia <dbl>, `Peptic
# _ulcer` <dbl>, Autoimmune <dbl>, Breast_Cancer <dbl>,
# Colon_Cancer <dbl>, Anus_Cancer <dbl>,
# Stomach_Cancer <dbl>, Pancreas_Cancer <dbl>,
# Ovarian_Cancer <dbl>, Cervix_uteri_Cancer <dbl>,
# Uterus_Cancer <dbl>, Prostate_Cancer <dbl>,
# Melanoma <dbl>, Lymphoma <dbl>, Leukemia <dbl>,
# Thyroid_Cancer <dbl>, Head_and_neck_Cancer <dbl>,
# Kidney_Cancer <dbl>, Adrenal_Cancer <dbl>,
# Bone_Cancer <dbl>, Testicular_Cancer <dbl>,
# Skin_Cancer <dbl>, Urinary_Cancer <dbl>,
# Liver_Cancer <dbl>, Musculoskeletal_Cancer <dbl>,
# Multiple_myeloma <dbl>, CNS_Cancer <dbl>,
# Unknown_primary_Cancer <dbl>, solid <dbl>
So in the first column are unique IDs, the subsequent columns are the names of different diseases (no repetitions). The values in rows 1 are if there's a disease and if NA there isn't, respectively.
For example, patient number 55 (5th row) have "PVD", "CHF", "MI", "HTN", "COPD" and so on.
I also created a vector with the names of cancers (these are the names of columns 19 to 43).
I want to write a function that will check this table, and if the column names match the cancers names of my vector, it will check if this row (number of patient) has a cancer in the matching column (if there is a flag 1), it adds a flag to the last column with the name "solid". At least one coincidence is enough. And so for all the patients.
For example, the same patient 55 if he has, say, a "Colon_cancer" (column 20), he should add 1 to the "solid" column, if he has some other cancer it does not matter.
I try something like this, but without success, and I'm stuck:
solid_tumor <- function(x){
x <- as.data.frame(x)
for (i in length(x)) {
if (colnames(x) %in% tumors) {
if(any(x==1)) {
x[i] <- 1
}
}
}
}
thank you.
Some other ways to look at this.
Simple function
This demonstrates the simplicity and flexibility of a simple functional approach:
func <- function(x, candidates) {
cnames <- intersect(candidates, colnames(x))
if (length(cnames)) {
+(rowSums(!is.na(subset(x, select = cnames))) > 0)
} else rep(0L, nrow(x))
}
dat$solid <- func(dat, c("CHF", "MI"))
dat
# ID PVD Vasculitis CVA CHF MI HTN COPD solid
# 1 11 NA NA NA NA NA 1 NA 0
# 2 22 1 NA 1 NA 1 1 1 1
# 3 33 NA NA NA NA 1 1 1 1
# 4 44 NA NA 1 NA NA NA 1 0
# 5 55 1 NA NA 1 1 1 1 1
# 6 66 NA NA NA 1 1 1 1 1
# 7 77 NA NA NA NA NA NA NA 0
# 8 88 1 NA 1 1 1 1 1 1
# 9 99 NA NA NA NA NA 1 1 0
# 10 1010 NA NA NA 1 1 1 NA 1
Tidy-friendly
Unfortunately, dplyr doesn't make it particularly easy to pass all columns to a function within mutate. (I'm sure somebody will recommend c_across() ... I have yet to find its use elegant in something like this.)
A first stab might use the dot .:
dat %>%
mutate(solid = func(., c("CHF", "MI")))
which works fine in the absence of any grouping, but it always uses the whole frame regardless of groups, so if that is ever a consideration, then you'll get an error:
dat %>%
group_by(ID) %>%
# I know this is equivalent to rowwise() with this data
mutate(solid = func(., c("CHF", "MI")))
# Error: Problem with `mutate()` input `solid`.
# x Input `solid` can't be recycled to size 1.
# i Input `solid` is `func(., c("CHF", "MI"))`.
# i Input `solid` must be size 1, not 10.
# i The error occured in group 1: ID = 11.
So the simple workaround is to just do it.
dat %>%
group_by(ID) %>%
do(mutate(., solid = func(., c("CHF", "MI"))))
(I realize that grouping is not relevant for this data, but applying a "simple" non-tidyverse function to all columns of the current frame can be useful elsewhere, and honoring groups is important.)
A pivoting approach, using tidyverse
It seems to me that it might be best if you pivot your data so it is easier to write functions on.
Sample Dataframe:
First lets make a sample dataframe that makes it easier for others to consider this problem and the solution in the future:
library(tidyverse)
na_or_1 <- c("NA", 1)
df <- tibble(
ID = sample(10:100, 30),
car = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
bug = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
blast = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
opt = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
star = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
queue = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
man = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
ring = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
happy = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
after = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
hug = sample(na_or_1, 30, replace = T, prob = c(.05, .95)),
dragon = sample(na_or_1, 30, replace = T, prob = c(.05, .95))
)
Pivot Longer
Now we can use the pivot_longer function to pivot all the data longer against the ID column.
df %>%
pivot_longer(cols = -ID) %>%
filter(!is.na(value))
This makes an output that is:
ID Name value where name == the name of the column (cancer names) and value == 1 (the NAs are all filtered out)
This makes it much easier for a function to apply logic to what you're asking it to do.
Mutate using case_when
Using a vector of tumors I can apply a case_when logic and mutate a new column solid. I took out the NA filter because it occurred to me that you will want your DF to be mostly unchanged.
df %>%
pivot_longer(cols = -ID) %>%
# filter( !is.na(value) ) %>%
mutate(solid = case_when(name %in% tumors &
!is.na(value) ~ 1,
T ~ NA_real_))
Pivot Wider
Finally, I will undo the longer pivot using pivot_wider. I will take the names from the name column that was originally created using the pivot longer.
df %>%
pivot_longer(cols = -ID) %>%
# filter( !is.na(value) ) %>%
mutate(solid = case_when(name %in% tumors &
!is.na(value) ~ 1,
T ~ NA_real_)) %>%
pivot_wider(
names_from = name
)
Function Solution
myfunction <- function(df, tumors){
df %>%
pivot_longer(cols = -ID) %>%
# filter( !is.na(value) ) %>%
mutate(solid = case_when(name %in% tumors &
!is.na(value) ~ 1,
T ~ NA_real_)) %>%
pivot_wider(
names_from = name
)
}
Example
Now when I take a vector of tumors I can put this into the function and the df and come with the answer to your original question.
tumors <- c("hug", "happy", "man")
myfunction(df, tumors)
# # A tibble: 32 x 14
# ID solid car bug blast opt star queue man ring happy after hug dragon
# <int> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 16 NA NA NA NA NA NA NA NA NA NA NA NA NA
# 2 98 NA NA NA NA NA NA NA NA NA NA NA NA NA
# 3 74 NA NA NA NA NA NA NA NA NA NA NA NA NA
# 4 50 NA NA NA NA NA NA 1 NA NA NA NA NA NA
# 5 50 1 NA NA NA NA NA NA 1 NA NA NA NA NA
# 6 29 NA NA NA NA NA NA NA NA NA NA NA NA NA
# 7 94 NA NA NA NA 1 NA NA NA NA NA NA NA NA
# 8 19 NA NA 1 NA NA NA NA NA NA NA NA NA NA
# 9 46 NA NA NA NA NA NA NA NA NA NA NA NA NA
# 10 15 NA NA 1 NA NA NA NA NA NA NA NA NA NA

Separate a shopping list into multiple columns

I have a shopping list data like this:
df <- data.frame(id = 1:5, item = c("apple2milk5", "milk1", "juice3apple5", "egg10juice1", "egg8milk2"), stringsAsFactors = F)
# id item
# 1 1 apple2milk5
# 2 2 milk1
# 3 3 juice3apple5
# 4 4 egg10juice1
# 5 5 egg8milk2
I want to separate the variable item into multiple columns and record the number behind the goods. The problem I met is that the goods each person purchases are different so I cannot solve it using tidyr::separate() or other analogous functions. What I expect is:
# id apple milk juice egg
# 1 1 2 5 NA NA
# 2 2 NA 1 NA NA
# 3 3 5 NA 3 NA
# 4 4 NA NA 1 10
# 5 5 NA 2 NA 8
Note: The categories of goods in the market are unknown. So don't assume there are only 4 kinds of goods.
Thanks for any helps!
I just came up with a tidyverse solution which uses stringr::str_extract_all() to extract the quantities, sets their names as product names, and expands them to wide using tidyr::unnest_wider().
library(tidyverse)
df %>%
mutate(N = map2(str_extract_all(item, "\\d+"), str_extract_all(item, "\\D+"), set_names)) %>%
unnest_wider(N, transform = as.numeric)
# # A tibble: 5 × 6
# id item apple milk juice egg
# <int> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 apple2milk5 2 5 NA NA
# 2 2 milk1 NA 1 NA NA
# 3 3 juice3apple5 5 NA 3 NA
# 4 4 egg10juice1 NA NA 1 10
# 5 5 egg8milk2 NA 2 NA 8
I'll add yet another answer. It only slightly differs from #ASuliman's but uses a bit of the newer tidyr and some cute regex to become a bit more straightforward.
The regex trick is that the pattern "(?<=\\d)\\B(?=[a-z])" will match the non-boundary (i.e. an empty location) between numbers and letters, allowing you to create rows for every "apple5" type of entry. Extract the letters into an item column and numbers into a count column. Using the new pivot_wider which replaces spread, you can convert those counts to numeric values as you reshape.
library(dplyr)
library(tidyr)
df %>%
separate_rows(item, sep = "(?<=\\d)\\B(?=[a-z])") %>%
extract(item, into = c("item", "count"), regex = "^([a-z]+)(\\d+)$") %>%
pivot_wider(names_from = item, values_from = count, values_fn = list(count = as.numeric))
#> # A tibble: 5 x 5
#> id apple milk juice egg
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 5 NA NA
#> 2 2 NA 1 NA NA
#> 3 3 5 NA 3 NA
#> 4 4 NA NA 1 10
#> 5 5 NA 2 NA 8
Possibily something like this, and should work with any item/quantity.
It just assumes that the quantity follows the item.
Lets use a custom function which extracts item and quantity:
my_fun <- function(w) {
items <- stringr::str_split(w, "\\d+", simplify = T)
items <- items[items!=""] # dont now why but you get en empty spot each time
quantities <- stringr::str_split(w, "\\D+", simplify = T)
quantities <- quantities[quantities!=""]
d <- data.frame(item = items, quantity=quantities, stringsAsFactors = F)
return(d)
}
Example:
my_fun("apple2milk5")
# gives:
# item quantity
# 1 apple 2
# 2 milk 5
Now we can apply the function to each id, using nest and map:
library(dplyr)
df_result <- df %>%
nest(item) %>%
mutate(res = purrr::map(data, ~my_fun(.x))) %>%
unnest(res)
df_results
# # A tibble: 9 x 3
# id item quantity
# <int> <chr> <chr>
# 1 1 apple 2
# 2 1 milk 5
# 3 2 milk 1
# 4 3 juice 3
# 5 3 apple 5
# 6 4 egg 10
# 7 4 juice 1
# 8 5 egg 8
# 9 5 milk 2
Now we can use dcast() (probabily spread would work too):
data.table::dcast(df_result, id~item, value.var="quantity")
# id apple egg juice milk
# 1 1 2 <NA> <NA> 5
# 2 2 <NA> <NA> <NA> 1
# 3 3 5 <NA> 3 <NA>
# 4 4 <NA> 10 1 <NA>
# 5 5 <NA> 8 <NA> 2
Data:
df <- data.frame(id = 1:5, item = c("apple2milk5", "milk1", "juice3apple5", "egg10juice1", "egg8milk2"), stringsAsFactors = F)
tmp = lapply(strsplit(df$item, "(?<=\\d)(?=\\D)|(?<=\\D)(?=\\d)", perl = TRUE),
function(x) {
d = split(x, 0:1)
setNames(as.numeric(d[[2]]), d[[1]])
})
nm = unique(unlist(lapply(tmp, names)))
cbind(df, do.call(rbind, lapply(tmp, function(x) setNames(x[nm], nm))))
# id item apple milk juice egg
#1 1 apple2milk5 2 5 NA NA
#2 2 milk1 NA 1 NA NA
#3 3 juice3apple5 5 NA 3 NA
#4 4 egg10juice1 NA NA 1 10
#5 5 egg8milk2 NA 2 NA 8
Place a space before each numeric substring and a newline after it. Then read that data using read.table and unnest it. Finally use pivot_wider to convert from long to wide form.
library(dplyr)
library(tidyr)
df %>%
mutate(item = gsub("(\\d+)", " \\1\n", item)) %>%
rowwise %>%
mutate(item = list(read.table(text = item, as.is = TRUE))) %>%
ungroup %>%
unnest(item) %>%
pivot_wider(names_from = "V1", values_from = "V2")
giving:
# A tibble: 5 x 5
id apple milk juice egg
<int> <int> <int> <int> <int>
1 1 2 5 NA NA
2 2 NA 1 NA NA
3 3 5 NA 3 NA
4 4 NA NA 1 10
5 5 NA 2 NA 8
Variation
This is a variation of the above code that eliminates the unnest. We replace each numeric string by a space, that string, another space, the id and a newline. Then use read.table to read that in. Note the use of %$% rather than %>% before the read.table. Finally use pivot_wider to convert from long to wide form.
library(dplyr)
library(magrittr)
library(tidyr)
df %>%
rowwise %>%
mutate(item = gsub("(\\d+)", paste(" \\1", id, "\n"), item)) %$%
read.table(text = item, as.is = TRUE, col.names = c("nm", "no", "id")) %>%
ungroup %>%
pivot_wider(names_from = "nm", values_from = "no")
You can try
library(tidyverse)
library(stringi)
df %>%
mutate(item2 =gsub("[0-9]", " ", df$item)) %>%
mutate(item3 =gsub("[a-z]", " ", df$item)) %>%
mutate_at(vars(item2, item3), ~stringi::stri_extract_all_words(.) %>% map(paste, collapse=",")) %>%
separate_rows(item2, item3, sep = ",") %>%
spread(item2, item3)
id item apple egg juice milk
1 1 apple2milk5 2 <NA> <NA> 5
2 2 milk1 <NA> <NA> <NA> 1
3 3 juice3apple5 5 <NA> 3 <NA>
4 4 egg10juice1 <NA> 10 1 <NA>
5 5 egg8milk2 <NA> 8 <NA> 2
#replace any digit followed by a character "positive look-ahead assertion" by the digit plus a comma
library(dplyr)
library(tidyr)
df %>% mutate(item=gsub('(\\d+(?=\\D))','\\1,' ,item, perl = TRUE)) %>%
separate_rows(item, sep = ",") %>%
extract(item, into = c('prod','quan'), '(\\D+)(\\d+)') %>%
spread(prod, quan, fill=0)
id apple egg juice milk
1 1 2 0 0 5
2 2 0 0 0 1
3 3 5 0 3 0
4 4 0 10 1 0
5 5 0 8 0 2
This is a simple solution in base R and stringr:
goods <- unique(unlist(stringr::str_split(df$item, pattern = "[0-9]")))
goods <- goods[goods != ""]
df <- cbind(df$id, sapply(goods,
function(x) stringr::str_extract(df$item, pattern = paste0(x,"[0-9]*"))))
df <- as.data.frame(df)
df[-1] <- lapply(df[-1], function(x) as.numeric(stringr::str_extract(x, pattern = "[0-9]*$")))
names(df)[1] <- "id"
Output
id apple milk juice egg
1 1 2 5 NA NA
2 2 NA 1 NA NA
3 3 5 NA 3 NA
4 4 NA NA 1 10
5 5 NA 2 NA 8
Mostly base R with some input from stringr and data.table:
library(stringr)
library(data.table)
cbind(
id = df$id,
rbindlist(
lapply(df$item, function(x) as.list(setNames(str_extract_all(x, "[0-9]+")[[1]], strsplit(x, "[0-9]+")[[1]]))),
fill = TRUE
)
)
id apple milk juice egg
1: 1 2 5 <NA> <NA>
2: 2 <NA> 1 <NA> <NA>
3: 3 5 <NA> 3 <NA>
4: 4 <NA> <NA> 1 10
5: 5 <NA> 2 <NA> 8
A cleaner data.table solution with input from stringr:
df[,
.(it_count = str_extract_all(item, "[0-9]+")[[1]],
it_name = str_extract_all(item, "[^0-9]+")[[1]]),
by = id
][, dcast(.SD, id ~ it_name, value.var = "it_count")]
id apple egg juice milk
1: 1 2 <NA> <NA> 5
2: 2 <NA> <NA> <NA> 1
3: 3 5 <NA> 3 <NA>
4: 4 <NA> 10 1 <NA>
5: 5 <NA> 8 <NA> 2

Resources