Given a column of a dataframe like:
# Example data: a tibble with a single column, `ingredients`, holding ten
# comma-separated ingredient strings, each item prefixed with a two-letter
# language code and a colon. Because of `.Dim = c(10L, 1L)` the column is
# stored as a 10 x 1 character matrix rather than a plain character vector.
df <- structure(list(ingredients = structure(c("en:xylitol", "en:water,en:sugar,en:glucose-fructose-syrup,en:fructose,en:glucose,fr:dioxyde-de-carbone,en:acid,en:citric-acid,en:natural-flavouring,en:flavouring,fr:arome-quinine,fr:quinine",
"en:sugar,en:corn-syrup,fr:sirop-de-mais-a-teneur-elevee-en-fructose,en:citric-acid,en:natural-and-artificial-flavouring,en:artificial-flavouring,en:natural-flavouring,en:flavouring,en:colour,fr:rouge-40,fr:bleu-1",
"pt:semoule-de-ble-dur,pt:pesto,pt:basilic,pt:fromage-en-poudre,pt:ail-et-epinars,pt:basilic-contient-du-gluten-et-des-derives-de-lait",
"pt:pimenta-branca", "en:water,es:pasta-de-almendras-tostadas,en:sugar",
"en:water,es:zumo-de-chufas,en:sugar,en:dextrose,en:glucose,es:estabilizantes,es:412,es:carragenanos,es:e-407,es:carboximetil-celulosa,es:e-466,es:monodigliceridos-de-acidos-grasos,en:mono-and-diglycerides-of-fatty-acids,en:emulsifier,en:flavouring,en:guar-gum,es:e",
"es:aceitunas-cacerena,en:water,en:salt,en:stabiliser,es:579,es:categoria,es:i,es:calibre,es:gluconato-ferroso,es:e,es:240,es:260",
"en:carbonated-water,en:water,en:sugar,en:colour,fr:caramel-e150d,en:natural-flavouring,en:flavouring,en:acid,en:phosphoric-acid,fr:extrait-de-genepi,fr:cafeine",
"en:pear,en:fruit,es:variedad,es:70-mm,es:conferencia,es:categoria-i,es:calibre,es:65"
), .Dim = c(10L, 1L))), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
I want to split each row by "," separator and put the separated output in a group.
For example:
en:water,es:pasta-de-almendras-tostadas,en:sugar
will turn into
group ingredient
1 en:water
1 es:pasta-de-almendras-tostadas
1 en:sugar
The next thing is to delete the prefix of XX:
Please advise how to do this?
We can use separate_rows from tidyr after adding a sequence column
library(tidyr)
library(dplyr)

# Tag every original row with its position, then explode the comma-separated
# string so that each ingredient gets its own row; the tag is carried along
# and serves as the group id.
separate_rows(
  mutate(df, group = row_number()),
  ingredients,
  sep = ","
)
You could use strsplit and add a group column according to list number, finally rbind the thing.
# Split each ingredient string on ","; the result has one list element per
# row of `df`.
l <- strsplit(df$ingredients, ",")
# Build one two-column character matrix (group, ingredient) per row and stack
# them. lapply() always returns a list, whereas sapply() would simplify the
# result to a single matrix whenever every row happens to contain the same
# number of ingredients, and do.call(rbind, ...) then fails.
res <- data.frame(do.call(rbind, lapply(seq_along(l), function(x)
  cbind(group = x, ingredient = l[[x]]))))
head(res)
# group ingredient
# 1 1 en:xylitol
# 2 2 en:water
# 3 2 en:sugar
# 4 2 en:glucose-fructose-syrup
# 5 2 en:fructose
# 6 2 en:glucose
Here is an answer including the removal of the prefix:
library(tidyverse)

df %>%
  mutate(
    # List-column: one character vector of ingredients per original row.
    ingredients = str_split(ingredients, ","),
    # Group id = position of the original row.
    row_num = row_number()
  ) %>%
  # Name the list-column explicitly: calling unnest() without `cols` is
  # deprecated since tidyr 1.0.0 and emits a warning.
  unnest(ingredients) %>%
  # Drop the language prefix, i.e. everything up to and including the
  # first ":".
  mutate(ingredients = str_remove(ingredients, "^[^:]+(:)"))
#> # A tibble: 82 x 2
#> row_num ingredients
#> <int> <chr>
#> 1 1 xylitol
#> 2 2 water
#> 3 2 sugar
#> 4 2 glucose-fructose-syrup
#> 5 2 fructose
#> 6 2 glucose
#> 7 2 dioxyde-de-carbone
#> 8 2 acid
#> 9 2 citric-acid
#> 10 2 natural-flavouring
#> # … with 72 more rows
We can use data.table:
# library() stops with an error when the package is missing; require() would
# only return FALSE and let the script fail later with a confusing message.
library(data.table)

# setDT() converts `df` to a data.table by reference. Grouping by the row
# index processes one original row at a time; each string is split on the
# literal "," and the pieces are flattened into one value per output row.
setDT(df)[, lapply(.SD, function(x) {
  unlist(tstrsplit(x, ",", fixed = TRUE))
}), by = seq.int(nrow(df))]
or just using base:
# Split every row on ",", label each resulting vector with its row number,
# and let stack() flatten the named list into a two-column data frame
# (`values` = ingredient, `ind` = row number).
stack(
  setNames(
    strsplit(df$ingredients, ','),
    seq.int(nrow(df))
  )
)
or using splitstackshape package:
# library() fails loudly if the package is not installed, unlike require().
library(splitstackshape)

# Prepend the row index as the group column, then split `ingredients` on ","
# into long format (one ingredient per row).
cSplit(cbind(seq.int(nrow(df)), df), "ingredients", ",", "long")
#> seq.int(nrow(df)) ingredients
#> 1: 1 en:xylitol
#> 2: 2 en:water
#> 3: 2 en:sugar
#> 4: 2 en:glucose-fructose-syrup
#> 5: 2 en:fructose
#> 6: 2 en:glucose
#...# … with 76 more rows (manually trimmed the output)
Using @akrun's suggestion of the separate_rows function, I used the following code with the regex ^[a-z]{2}: which removes, at the beginning of each value, two lowercase letters (a-z) followed by a colon (:).
# Every helper is namespace-qualified so the pipeline does not depend on
# dplyr or stringr being attached (row_number() and str_remove() are ordinary
# functions and are not found otherwise).
df %>%
  dplyr::mutate(group = dplyr::row_number()) %>%
  tidyr::separate_rows(ingredients, sep = ",", convert = TRUE) %>%
  # Remove the two-letter language prefix and its colon.
  dplyr::mutate(
    ingredients = stringr::str_remove(ingredients, pattern = "^[a-z]{2}:")
  ) %>%
  # NOTE(review): distinct() keeps only the first occurrence of each
  # ingredient across ALL groups, so an ingredient shared by several rows
  # survives only in the first group it appears in -- confirm this is wanted.
  dplyr::distinct(ingredients, .keep_all = TRUE)
Related
This question already has answers here:
Extracting numbers from vectors of strings
(12 answers)
Closed 10 months ago.
I have a dataframe exported from the web with this format
id vals
1 {7,12,58,1}
2 {1,2,5,7}
3 {15,12}
I would like to extract ONLY the numbers (ignore curlys and commas) into multiple columns like this
id val_1 val_2 val_3 val_4 val_5
1 7 12 58 1
2 1 2 5 7
3 15 12
Even though the Max of values we got was 4 I want to always go up to value val_5.
Thanks!
We could use str_extract_all for this:
library(dplyr)
library(stringr)

df %>%
  # simplify = NA makes the result a character matrix (one row per input
  # string) whose short rows are padded with NA. A real logical NA is passed
  # rather than an empty string, which only worked through implicit coercion.
  mutate(vals = str_extract_all(vals, "\\d+", simplify = NA))
or, as @akrun suggests in the comments, flatten the matrix column into separate columns:
df %>%
  # Matrix of extracted numbers, one row per input string, padded with NA
  # (simplify must be a logical; NA selects NA padding).
  mutate(vals = str_extract_all(vals, "\\d+", simplify = NA)) %>%
  # data.frame() expands the matrix column into vals.1, vals.2, ...
  do.call(data.frame, .)
id vals.1 vals.2 vals.3 vals.4
1 1 7 12 58 1
2 2 1 2 5 7
3 3 15 12 <NA> <NA>
data:
# Example data: three ids, each with a brace-wrapped, comma-separated string
# of numbers in `vals`.
df <- structure(list(id = 1:3, vals = c("{7,12,58,1}", "{1,2,5,7}",
"{15,12}")), class = "data.frame", row.names = c(NA, -3L))
Another possible tidyverse option, where we remove the curly brackets, then separate the rows on the ,, then pivot to wide form. Then, we can create the additional column (using add_column from tibble) based on the max value in the column names (which is 4 in this case), and then can create val_5.
library(tidyverse)

df %>%
  # Strip the curly brackets.
  mutate(vals = str_replace_all(vals, "\\{|\\}", "")) %>%
  # One value per row.
  separate_rows(vals, sep = ",") %>%
  # Position of each value within its id; this becomes the column suffix.
  group_by(id) %>%
  mutate(ind = row_number()) %>%
  # The grouping is only needed for the index; drop it so later steps work on
  # a plain tibble.
  ungroup() %>%
  pivot_wider(names_from = ind, values_from = vals, names_prefix = "val_") %>%
  # Append one extra, empty column numbered one past the last val_* column.
  add_column(!!(paste0("val_", parse_number(names(.)[ncol(.)]) + 1)) := NA)
Output
id val_1 val_2 val_3 val_4 val_5
1 1 7 12 58 1 NA
2 2 1 2 5 7 NA
3 3 15 12 <NA> <NA> NA
Data
# Example data read from inline text. `header = TRUE` is spelled out because
# `T` is an ordinary variable that can be reassigned.
df <- read.table(text = "id vals
1 {7,12,58,1}
2 {1,2,5,7}
3 {15,12} ", header = TRUE)
Using data.table
library(data.table)
library(stringi)
# For each id, extract every run of digits from `vals`. `[[1]]` takes the
# matches of the first string in the group -- presumably each id occurs once;
# verify against the data. The matches end up in a column named V1 (see the
# dcast call below).
result <- setDT(df)[, stri_match_all_regex(vals, '\\d+')[[1]], by=.(id)]
# Number the values within each id: val_1, val_2, ...
result[, item:=paste('val', 1:.N, sep='_'), by=.(id)] # defines column names
# One row per id, one column per item label.
dcast(result, id~item, value.var = 'V1') # convert from long to wide
## id val_1 val_2 val_3 val_4
## 1: 1 7 12 58 1
## 2: 2 1 2 5 7
## 3: 3 15 12 <NA> <NA>
I have a table of data which includes, among others, an ID, a (somehow sorted) grouping column and a date. For each ID, based on the minimum value of the date for a given group, I would like to filter out the rows of another given group that occurred after that date.
I thought about using pivot_wider and pivot_longer, but I was not able to operate on columns containing list values and single values simultaneously.
How can I do it efficiently (using any tidyverse method, if possible)?
For instance, given
library(dplyr)

# Example data: two ids with five observations each, a type label per
# observation, and a date expressed as an offset (in days) back from
# 2021-12-07.
tbl <- tibble(
  id = rep(c(1, 2), each = 5),
  type = c("A", "A", "A", "B", "C", "A", "A", "B", "B", "C"),
  dat = as.Date("2021-12-07") - c(3, 0, 1, 2, 0, 3, 6, 2, 4, 3)
)
# A tibble: 10 × 3
# id type dat
# <int> <chr> <date>
# 1 1 A 2021-12-04
# 2 1 A 2021-12-07
# 3 1 A 2021-12-06
# 4 1 B 2021-12-05
# 5 1 C 2021-12-07
# 6 2 A 2021-12-04
# 7 2 A 2021-12-01
# 8 2 B 2021-12-05
# 9 2 B 2021-12-03
# 10 2 C 2021-12-04
I would like the following result, where I discarded A-typed elements that occurred after the first of the B-typed ones, but none of the C-typed ones:
# A tibble: 7 × 3
# id type dat
# <int> <chr> <date>
# 1 1 A 2021-12-04
# 2 1 B 2021-12-05
# 3 1 C 2021-12-07
# 4 2 A 2021-12-01
# 5 2 B 2021-12-05
# 6 2 B 2021-12-03
# 7 2 C 2021-12-04
I like to use pivot_wider and pivot_longer in this case. It does the trick, but maybe you are looking for something shorter.
# Spread the dates into one column per type, keep rows whose A date is before
# the earliest B date (or that have no A date), then reshape back to long
# form and drop the empty cells created by the reshape.
# NOTE(review): this overwrites `tbl` with a five-row example in which every
# id is unique; it has not been checked here against data where ids repeat --
# confirm before applying it to such data.
tbl <- tibble(id = 1:5, type = c("A","A","A","B","C"), dat = as.Date("2021-12-07") - c(3,4,1,2,0)) %>%
pivot_wider(names_from = type, values_from = dat) %>%
# min(B) is taken over the whole B column, i.e. across all ids.
filter(A < min(B, na.rm = TRUE) | is.na(A)) %>%
# Columns 2:4 are the A, B and C columns produced by pivot_wider above.
pivot_longer(2:4, names_to = "type", values_to = "dat") %>%
na.omit()
# A tibble: 4 × 3
id type dat
<int> <chr> <date>
1 1 A 2021-12-04
2 2 A 2021-12-03
3 4 B 2021-12-05
4 5 C 2021-12-07
An easy way using kind of SQL logic :
# Rows to drop: for each id, the "A" rows dated after that id's earliest "B"
# row. The cutoff is computed per id, not over the whole table, so one id's
# B dates never affect another id's A rows.
tbl_to_delete <- tbl %>%
  dplyr::group_by(id) %>%
  # ids without any "B" row have no cutoff and therefore nothing to drop;
  # removing them first also avoids min() being called on an empty vector.
  dplyr::filter(any(type == "B")) %>%
  dplyr::filter(type == "A", dat > min(dat[type == "B"])) %>%
  dplyr::ungroup()
# Match on id as well, so a row of another id that happens to share the same
# type and date is not removed by accident.
tbl2 <- tbl %>% dplyr::anti_join(tbl_to_delete, by = c("id", "type", "dat"))
First you isolate the rows you want to delete, then you discard them from your original data.
You can of course merge the two lines before into one for better code management :
# Single-expression version: drop, for each id, the "A" rows dated after that
# id's earliest "B" row.
tbl %>%
  anti_join(
    tbl %>%
      group_by(id) %>%
      # Skip ids that have no "B" row: there is no cutoff for them.
      filter(any(type == "B")) %>%
      filter(type == "A", dat > min(dat[type == "B"])) %>%
      ungroup(),
    # Include id in the key so rows of other ids are never matched.
    by = c("id", "type", "dat")
  )
Or, if you would rather avoid base R subsetting altogether:
# dplyr-only version: compute the earliest "B" date per id, attach it to the
# "A" rows of the same id, and anti-join away the "A" rows that come later.
tbl %>%
  anti_join(
    tbl %>%
      filter(type == "A") %>%
      # inner_join drops ids that have no "B" row, so nothing is removed for
      # them.
      inner_join(
        tbl %>%
          filter(type == "B") %>%
          group_by(id) %>%
          summarise(first_b = min(dat)),
        by = "id"
      ) %>%
      filter(dat > first_b),
    # Include id in the key so rows of other ids are never matched.
    by = c("id", "type", "dat")
  )
I have this dataframe separate_on_condition with two columns:
# Example data: one row; `first` encodes letter + repeat count per item,
# `second` is a comma-separated list of values.
separate_on_condition <- data.frame(first = "a3,b1,c2", second = "1,2,3,4,5,6")
# first second
# 1 a3,b1,c2 1,2,3,4,5,6
How can I turn it to:
# A tibble: 6 x 2
first second
<chr> <chr>
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6
where:
a3 will be separated into 3 rows
b1 into 1 row
c2 into 2 rows
Is there a better way on achieving this instead of using rep() on first column and separate_rows() on the second column?
Any help would be much appreciated!
Create a row number column to account for multiple rows.
Split second column on , in separate rows.
For each row extract the data to be repeated along with number of times it needs to be repeated.
library(dplyr)
library(tidyr)
library(stringr)

separate_on_condition %>%
  # Remember which original row each value came from.
  mutate(row = row_number()) %>%
  # One value of `second` per row.
  separate_rows(second, sep = ",") %>%
  group_by(row) %>%
  # Within each original row, `first` is the same string on every line, so
  # take it once, pull out the letters and their repeat counts, and repeat
  # each letter accordingly. The counts are extracted as text, so convert
  # them explicitly instead of relying on rep() to coerce.
  mutate(first = rep(
    str_extract_all(first(first), "[a-zA-Z]+")[[1]],
    as.integer(str_extract_all(first(first), "\\d+")[[1]])
  )) %>%
  ungroup() %>%
  select(-row)
# first second
# <chr> <chr>
#1 a 1
#2 a 2
#3 a 3
#4 b 4
#5 c 5
#6 c 6
You can use the following base R option
with(
  separate_on_condition,
  data.frame(
    # Split `first` into its items, then repeat each item's non-digit part
    # as many times as its digit part says. lapply() is used so the result
    # is always a list, whatever the counts are.
    first = unlist(lapply(
      unlist(strsplit(first, ",")),
      function(x) rep(gsub("\\d", "", x), as.numeric(gsub("\\D", "", x)))
    ), use.names = FALSE),
    # Parse the numbers directly instead of building and evaluating R code
    # from the string.
    second = as.numeric(unlist(strsplit(second, ",")))
  )
)
which gives
first second
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6
Here is an alternative approach:
add NA to first to get same length
use separate_rows to bring each element to a row
use extract by regex digit to split first into first and helper
group and slice by values in helper
do some tweaking
library(tidyr)
library(dplyr)
library(stringr)  # provides str_c(), used below

separate_on_condition %>%
  # Pad `first` with three NA items so it has as many items as `second`.
  mutate(first = str_c(first, ",NA,NA,NA")) %>%
  # Split both columns in parallel, one item per row.
  separate_rows(first, second, sep = "[^[:alnum:].]+", convert = TRUE) %>%
  # Split each item into its first character and its second character
  # (the repeat count), keeping the original column as well.
  extract(first, into = c("first", "helper"), "(.{1})(.{1})", remove = FALSE) %>%
  group_by(second) %>%
  # Repeat each row `helper` times.
  slice(rep(seq_len(n()), each = helper)) %>%
  ungroup() %>%
  # Remove the rows that came from the NA padding.
  drop_na() %>%
  # Renumber `second` consecutively.
  mutate(second = row_number()) %>%
  select(first, second)
first second
<chr> <int>
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6
Let's say I have the data frame:
# Example data: one pair of columns that differ only by their numeric suffix.
z <- data.frame(col_1 = c(1, 2, 3, 4), col_2 = c(3, 4, 5, 6))
col_1 col_2
1 1 3
2 2 4
3 3 5
4 4 6
I want to take columns with the same name that only differ by the number e.g. '_1' and '_2' and take the pairwise mean. In reality I have a big dataframe with many pairs and they are not in a nice order, therefore looking for a clever solution that can be applied to this.
So the output should look like this:
col
1 2
2 3
3 4
4 5
With the column name given as the same as the column pair but with the additional label removed.
Any help would be great thanks.
Here is a base R option using list2DF + split.default + rowMeans
# Group the columns of `z` by their name with the "_<number>" part removed,
# take the row-wise mean inside each group, and reassemble the results into
# a data frame with one column per group.
list2DF(
  lapply(
    split.default(z, gsub("_\\d+", "", names(z))),
    rowMeans
  )
)
which gives
col
1 2
2 3
3 4
4 5
Try this tidyverse approach. By using separate() you can extract the name and then with reshaping you can reach the desired output. Here the code:
library(dplyr)
library(tidyr)

# Data
z <- data.frame(col_1 = c(1, 2, 3, 4), col_2 = c(3, 4, 5, 6))

# Code
z1 <- z %>%
  # Row id, needed to put the values back in place after reshaping.
  mutate(id = seq_len(n())) %>%
  pivot_longer(-id) %>%
  # Split "col_1" into the shared stem ("col") and the numeric suffix ("1").
  separate(name, c("var1", "var2"), sep = "_") %>%
  group_by(id, var1) %>%
  # Mean over the suffixes for each row and stem; state the resulting
  # grouping explicitly so summarise() does not emit its regrouping message.
  summarise(Mean = mean(value), .groups = "drop_last") %>%
  pivot_wider(names_from = var1, values_from = Mean) %>%
  ungroup() %>%
  select(-id)
Output:
# A tibble: 4 x 1
col
<dbl>
1 2
2 3
3 4
4 5
Here is a purrr oriented solution:
library(purrr)
library(stringr)

# Group the columns by their name without the trailing "_<number>" -- the
# underscore is removed too, so the result is named "col" rather than "col_"
# -- and take the row-wise mean of each group.
split.default(z, str_remove(names(z), "_\\d+$")) %>%
  map_dfc(rowMeans)
#> # A tibble: 4 x 1
#> col_
#> <dbl>
#> 1 2
#> 2 3
#> 3 4
#> 4 5
It works even if z is:
# Example data with two independent column pairs, each differing only by the
# numeric suffix.
z <- data.frame(
  col_1 = c(1, 2, 3, 4),
  col_2 = c(3, 4, 5, 6),
  anothercol_1 = c(1, 2, 3, 4),
  anothercol_2 = c(3, 4, 5, 6)
)
I'd like to recode the values in the df1 data frame using the df2 data frame so that I end up with a data frame like df3.
The current code almost does the trick, but there are two problems. First, it introduces NA when there's no match, e.g. there is no match in df2 for the df1 aed_bloodpr variable value "1,2" so the value becomes NA. Second, when a variable in df1 can't be mapped to df2, the code won't run (error message).
Have looked into the nomatch argument for match() and the .default argument for Map(), but I can't figure out how to use them so that I end up with df3.
Starting point:
# Values to recode: one column per variable, values stored as text (some
# cells hold several comma-separated values).
Df1 <- data.frame(
  aed_bloodpr = c("1,2", "2", "1", "1"),
  aed_gluco = c("2", "1", "3", "2"),
  add_bmi = c("2", "5,7", "7", "5"),
  add_asthma = c("2", "2", "7", "5"),
  nausea = c("3", "3", "4", "5")
)
# Lookup table: for each variable name, the label of each numeric level.
Df2 <- data.frame(
  NameOfVariable = c(
    "aed_bloodpr", "aed_bloodpr",
    "aed_gluco", "aed_gluco", "aed_gluco",
    "add_bmi", "add_bmi", "add_bmi"
  ),
  VariableLevel = c(1, 2, 1, 2, 3, 2, 5, 7),
  VariableDef = c(
    "high", "normal",
    "elevated", "normal", "NA",
    "above", "normal", "below"
  )
)
End point:
# Desired result: mapped values are replaced by their label, everything else
# (unmatched values and unmapped variables) is left as it was.
Df3 <- data.frame(
  aed_bloodpr = c("1,2", "normal", "high", "high"),
  aed_gluco = c("normal", "elevated", "NA", "normal"),
  add_bmi = c("above", "5,7", "below", "normal"),
  add_asthma = c("2", "2", "7", "5"),
  nausea = c("3", "3", "4", "5")
)
Current code:
# Recode every column of Df1 with the lookup rows of Df2 that belong to it.
data.frame(Map(
  function(x, y) {
    x <- as.character(x)
    # Column of Df1 with no entry in Df2: indexing the split list by its name
    # yields NULL, so hand the values back untouched instead of failing on
    # y[[2]].
    if (is.null(y)) {
      return(x)
    }
    # y[[1]] holds the levels, y[[2]] the labels for this variable.
    recoded <- as.character(y[[2]])[match(x, y[[1]])]
    # Values without a matching level (e.g. "1,2") keep their original text
    # rather than becoming NA.
    ifelse(is.na(recoded), x, recoded)
  },
  Df1,
  split(Df2[2:3], Df2[1])[names(Df1)]
))
You need to clean up before you can relabel. The actual relabeling is more easily accomplished by a join. Here using the tidyverse (translate as you like):
library(tidyverse)
# Values to recode: one column per variable, values stored as text (some
# cells hold several comma-separated values).
Df1 <- data.frame(
  aed_bloodpr = c("1,2", "2", "1", "1"),
  aed_gluco = c("2", "1", "3", "2"),
  add_bmi = c("2", "5,7", "7", "5"),
  add_asthma = c("2", "2", "7", "5"),
  nausea = c("3", "3", "4", "5")
)
# Lookup table: for each variable name, the label of each numeric level.
Df2 <- data.frame(
  NameOfVariable = c(
    "aed_bloodpr", "aed_bloodpr",
    "aed_gluco", "aed_gluco", "aed_gluco",
    "add_bmi", "add_bmi", "add_bmi"
  ),
  VariableLevel = c(1, 2, 1, 2, 3, 2, 5, 7),
  VariableDef = c(
    "high", "normal",
    "elevated", "normal", "NA",
    "above", "normal", "below"
  )
)
# Reshape Df1 to long form: one row per (original row, variable, single
# value), so that multi-valued cells such as "1,2" become separate rows.
Df1_long <- Df1 %>%
mutate_all(as.character) %>% # change factors to strings
rowid_to_column('i') %>% # add row index to enable later long-to-wide reshape
gather(variable, value, -i) %>% # reshape to long form
# No `sep` is given, so separate_rows() uses its default separator to split
# the comma-separated cells.
separate_rows(value, convert = TRUE) # unnest nested values and convert to numeric
# Inspect the structure of the result.
str(Df1_long)
#> 'data.frame': 22 obs. of 3 variables:
#> $ i : int 1 1 2 3 4 1 2 3 4 1 ...
#> $ variable: chr "aed_bloodpr" "aed_bloodpr" "aed_bloodpr" "aed_bloodpr" ...
#> $ value : int 1 2 2 1 1 2 1 3 2 2 ...
Df2_clean <- Df2 %>%
  # Change factors to strings.
  mutate(across(where(is.factor), as.character)) %>%
  # Turn the literal string "NA" into a real missing value. This is limited
  # to character columns: na_if() needs its second argument to be compatible
  # with the column type, so applying it to the numeric VariableLevel column
  # fails with recent dplyr versions.
  mutate(across(where(is.character), ~ na_if(.x, "NA")))
# Attach the label of each (variable, level) pair, fall back to the raw value
# where no label exists, then collapse back to one cell per original row and
# variable and return to wide form.
Df3 <- Df1_long %>%
left_join(Df2_clean, by = c('variable' = 'NameOfVariable', # merge
'value' = 'VariableLevel')) %>%
# coalesce() keeps the label when there is one and otherwise uses the value
# itself, converted to text.
mutate(VariableDef = coalesce(VariableDef, as.character(value))) %>% # combine labels and values
group_by(i, variable) %>%
# toString() joins several labels of one cell with ", ".
summarise(value = toString(VariableDef)) %>% # re-aggregate multiple values
# NOTE(review): no ungroup() follows, so the result stays grouped by `i`.
spread(variable, value) # reshape to wide form
# Print the result.
Df3
#> # A tibble: 4 x 6
#> # Groups: i [4]
#> i add_asthma add_bmi aed_bloodpr aed_gluco nausea
#> * <int> <chr> <chr> <chr> <chr> <chr>
#> 1 1 2 above high, normal normal 3
#> 2 2 2 normal, below normal elevated 3
#> 3 3 7 below high 3 4
#> 4 4 5 normal high normal 5