remove character for all column names in a data frame - r

I have a large data frame. I'm trying to remove the character v from the variable names of the data frame
df <- tibble(q_ve5 = 1:2,
q_f_1v = 3:4,
q_vf_2 = 3:4,
q_e6 = 5:6,
q_ev8 = 5:6)
I tried this. It seems my regular expression pattern is not correct
df %>%
rename_all(~ str_remove(., "\\v\\d+$"))
My desired col names:
q_e5 q_f_1 q_f_2 q_e6 q_e8

If we only need to remove 'v', the one-or-more-digits pattern (\\d+) anchored at the end ($) is not needed, since the expected output also removes 'v' from the first column, 'q_ve5'
library(dplyr)
library(stringr)
df %>%
rename_with(~ str_remove(., "v"), everything())
-output
# A tibble: 2 × 5
q_e5 q_f_1 q_f_2 q_e6 q_e8
<int> <int> <int> <int> <int>
1 1 3 3 5 5
2 2 4 4 6 6
Or without any packages
names(df) <- sub("v", "", names(df))

Related

Separate rows with conditions

I have this dataframe separate_on_condition with two columns:
# One-row example input: `first` holds comma-separated letter+count tokens,
# `second` holds the comma-separated values to spread over the rows.
separate_on_condition <- data.frame(first = 'a3,b1,c2', second = '1,2,3,4,5,6')
# first second
# 1 a3,b1,c2 1,2,3,4,5,6
How can I turn it to:
# A tibble: 6 x 2
first second
<chr> <chr>
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6
where:
a3 will be separated into 3 rows
b1 into 1 row
c2 into 2 rows
Is there a better way on achieving this instead of using rep() on first column and separate_rows() on the second column?
Any help would be much appreciated!
Create a row number column to account for multiple rows.
Split second column on , in separate rows.
For each row extract the data to be repeated along with number of times it needs to be repeated.
library(dplyr)
library(tidyr)
library(stringr)
separate_on_condition %>%
mutate(row = row_number()) %>%
separate_rows(second, sep = ',') %>%
group_by(row) %>%
mutate(first = rep(str_extract_all(first(first), '[a-zA-Z]+')[[1]],
str_extract_all(first(first), '\\d+')[[1]])) %>%
ungroup %>%
select(-row)
# first second
# <chr> <chr>
#1 a 1
#2 a 2
#3 a 3
#4 b 4
#5 c 5
#6 c 6
You can use the following base R option
with(
separate_on_condition,
data.frame(
first = unlist(sapply(
unlist(strsplit(first, ",")),
function(x) rep(gsub("\\d", "", x), as.numeric(gsub("\\D", "", x)))
), use.names = FALSE),
second = eval(str2lang(sprintf("c(%s)", second)))
)
)
which gives
first second
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6
Here is an alternative approach:
add NA to first to get same length
use separate_rows to bring each element to a row
use extract by regex digit to split first into first and helper
group and slice by values in helper
do some tweaking
library(tidyr)
library(dplyr)
library(stringr)  # provides str_c(), used below to pad `first`

separate_on_condition %>%
  # Pad `first` with NA tokens so it splits into as many pieces as `second`.
  mutate(first = str_c(first, ",NA,NA,NA")) %>%
  # Split both columns in parallel, one token per row.
  separate_rows(first, second, sep = "[^[:alnum:].]+", convert = TRUE) %>%
  # Split each token into its letter (`first`) and repeat count (`helper`).
  extract(first, into = c("first", "helper"), "(.{1})(.{1})", remove=FALSE) %>%
  group_by(second) %>%
  # Repeat each row `helper` times.
  slice(rep(1:n(), each = helper)) %>%
  ungroup() %>%
  # Remove the rows that came from the NA padding.
  drop_na() %>%
  # Renumber `second` to run over the final rows.
  mutate(second = row_number()) %>%
  select(first, second)
first second
<chr> <int>
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6

Tidyeval: apply function to data frames extracted from list

This is a simplified version of a problem involving a large list containing complex tables. I want to extract the tables from the list and apply a function to each one. Here we can create a simple list containing small named data frames:
library(tidyverse)
table_names <- c('dfA', 'dfB', 'dfC')
dfA <- tibble(a = 1:3, b = 4:6, c = 7:9)
dfB <- tibble(a = 10:12, b = 13:15, c = 16:18)
dfC <- tibble(a = 19:21, b = 22:24, c = 25:27)
df_list <- list(dfA, dfB, dfC) %>% setNames(table_names)
Here is a simplified example of the kind of operation I would like to apply:
dfA_mod <- df_list$dfA %>%
mutate(name = 'dfA') %>%
select(name, everything())
In this example, I extract one of three tables in the list df_list$dfA, create a new column with the same value in each row mutate(name = 'dfA'), and re-order the columns so that the new column appears in the left-most position select(name, everything()). The resulting object is assigned to dfA_mod.
To solve the larger problem, I want to use one of the purrr::map() variants to apply the function over the character vector table_names, which was initiated in the first block of code above. The elements of table_names serve two purposes: 1) naming the tables held in the list; and 2) supplying values for the name column in the modified table.
I could write a function such as:
# Attempted helper (does not work as written). Intent: take the table named by
# `x` from `df_list`, add a `name` column holding `x`, move that column to the
# front, and assign the result to "<x>_mod" in the global environment.
fun <- function(x) {
# NOTE(review): `$` takes its right-hand side literally, so this looks up an
# element called "x" rather than the element named by the value of `x`.
df_list$x %>%
mutate(name = x) %>%
select(name, everything()) %>%
assign(paste0(x, '_mod'), ., envir = .GlobalEnv)
}
And then use map() to create a new list of modified tables:
new_list <- df_list %>% map(table_name, fun(x))
But of course this code does not work, with the main obstacle being (for me at least) figuring out how to quote and unquote the right terms within the function. I'm a beginner at tidy evaluation, and I could use some help in specifying the function and using map properly.
Here is the desired output (for one modified table):
# A tibble: 3 x 4
name a b c
<chr> <int> <int> <int>
1 dfA 1 4 7
2 dfA 2 5 8
3 dfA 3 6 9
Thanks in advance for any help!
We can use purrr::imap which passes data in the list as well as name of the list
library(dplyr)
library(purrr)
df_out <- imap(df_list, ~.x %>% mutate(name = .y) %>% select(name, everything()))
df_out
#$dfA
# A tibble: 3 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfA 1 4 7
#2 dfA 2 5 8
#3 dfA 3 6 9
#$dfB
# A tibble: 3 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfB 10 13 16
#....
#....
This gives a list of desired dataframes, if you want them as separate dataframes, you can do
names(df_out) <- paste0(names(df_out), "_mod")
list2env(df_out, .GlobalEnv)
We can also do it using base R Map
df_out <- Map(function(x, y) transform(x, name = y)[c('name', names(x))],
df_list, names(df_list))
and give list names same as above.
We can convert it to a single data.frame with map while passing the .id
library(purrr)
map_dfr(df_list, I, .id = 'name')
Or with bind_rows
library(dplyr)
bind_rows(df_list, .id = 'name')
# A tibble: 9 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfA 1 4 7
#2 dfA 2 5 8
#3 dfA 3 6 9
#4 dfB 10 13 16
#5 dfB 11 14 17
#6 dfB 12 15 18
#7 dfC 19 22 25
#8 dfC 20 23 26
#9 dfC 21 24 27

How to extract elements from nested list in R

I'm working with json data which I've converted into a tibble with some list columns. I'm trying to extract the useful information from the list columns but am facing issues. If given the following dataset-
mydf <-tibble(
x = c(1, 2, 3),
y = list(list(list(id="id1", title="title1"), list(id="id11", title="title11")),
list(id="id2",title="title2"),
NULL)
)
How can I convert it into the following-
data.frame(x=c(1:3), id = c("id1;id11", "id2", ""), title = c("title1;title11", "title2", ""))
# x id title
#1 1 id1;id11 title1;title11
#2 2 id2 title2
#3 3
Any help is appreciated. Thanks!
I think there are better ways, but this is what I can do for now. For each row, I extracted strings and concatenated them with toString(). Since unnest() creates multiple rows for each row (i.e., 1, 2, and 3 in x), I used summarize() to temporarily combine strings. Then, I separate them using separate().
mydf %>%
unnest(y, keep_empty = TRUE) %>%
rowwise %>%
mutate(y = toString(unlist(y))) %>%
group_by(x) %>%
summarize(string = paste(y, collapse = "_")) %>%
separate(col = string, into = c("id", "title"), sep = "_")
# x id title
# <dbl> <chr> <chr>
#1 1 id1, title1 id11, title11
#2 2 id2 title2
#3 3 "" NA
If the names are consistent as in the example, you can do:
mydf2 <- unlist(mydf)
x <- mydf2[grepl("x", names(mydf2))]
id <- mydf2[grepl("id", names(mydf2))]
title <- mydf2[grepl("title", names(mydf2))]
tibble(x, id, title)
# A tibble: 3 x 3
x id title
<chr> <chr> <chr>
1 1 id1 title1
2 2 id11 title11
3 3 id2 title2

R: How to extract a list from a dataframe?

Consider this simple example
> weird_df <- data_frame(col1 =c('hello', 'world', 'again'),
+ col_weird = list(list(12,23), list(23,24), NA))
>
> weird_df
# A tibble: 3 x 2
col1 col_weird
<chr> <list>
1 hello <list [2]>
2 world <list [2]>
3 again <lgl [1]>
I need to extract the values in the col_weird. How can I do that? I see how to do that in Python but not in R. Expected output is:
> good_df
# A tibble: 3 x 3
col1 tic toc
<chr> <dbl> <dbl>
1 hello 12 23
2 world 23 24
3 again NA NA
If you collapse the list column into a string you can use separate from tidyr. I used map from purrr to loop through the list column and create a string with toString.
library(dplyr)  # provides mutate()
library(tidyr)
library(purrr)

weird_df %>%
  # Collapse each list element to one string such as "12, 23"; map_chr() returns
  # a plain character column, which is what separate() operates on.
  mutate(col_weird = map_chr(col_weird, toString)) %>%
  # Split on the non-alphanumeric separator and convert the pieces to numbers.
  separate(col_weird, into = c("tic", "toc"), convert = TRUE)
# A tibble: 3 x 3
col1 tic toc
* <chr> <int> <int>
1 hello 12 23
2 world 23 24
3 again NA NA
You can actually use separate directly without the toString part but you end up with "list" as one of the values.
weird_df %>%
separate(col_weird, into = c("list", "tic", "toc"), convert = TRUE) %>%
select(-list)
This led me to tidyr::extract, which works fine with the right regular expression. If your list column was more complicated, though, writing out the regular expression might be a pain.
weird_df %>%
extract(col_weird, into = c("tic", "toc"), regex = "([[:digit:]]+), ([[:digit:]]+)", convert = TRUE)
You can do this with basic R, thanks to I():
weird_df <- data.frame(col1 =c('hello', 'world'),
col_weird = I(list(list(12,23),list(23,24))))
weird_df
> col1 col_weird
1 hello 12, 23
2 world 23, 24
weird_df <- data_frame(col1 = c('hello', 'world'),
col_weird = list(list(12,23), list(23,24)))
library(dplyr)
weird_df %>%
dplyr::mutate(tic = unlist(magrittr::extract2(col_weird, 1)),
toc = unlist(magrittr::extract2(col_weird, 2)),
col_weird = NULL)
With the last changes: Note that now col_weird contains list(NA, NA)
weird_df <- data_frame(col1 = c('hello', 'world', 'again'),
col_weird = list(list(12,23), list(23,24), list(NA, NA)))
library(dplyr)
weird_df %>%
dplyr::mutate(col_weird = matrix(col_weird),
tic = sapply(col_weird, function(x) magrittr::extract2(x, 1)),
toc = sapply(col_weird, function(x) magrittr::extract2(x, 2)),
col_weird = NULL)
Here is one option to do with purrr/tidyverse/reshape2. We unlist the 'col_weird' within map to get the output as list, set the names of the list with 'col1', melt to 'long' format, grouped by 'L1', create a 'rn' column and spread it back to 'wide'
library(tidyverse)
library(reshape2)
weird_df$col_weird %>%
map(unlist) %>%
setNames(., weird_df$col1) %>%
melt %>%
group_by(L1) %>%
mutate(rn = c('tic', 'toc')[row_number()]) %>%
spread(rn, value) %>%
left_join(weird_df[-2], ., by = c(col1 = "L1"))
well, I came up with a simple one
> weird_df %>%
+ rowwise() %>%
+ mutate(tic = col_weird[[1]],
+ tac = ifelse(length(col_weird) == 2, col_weird[[2]], NA)) %>%
+ select(-col_weird) %>% ungroup()
# A tibble: 3 x 3
col1 tic tac
<chr> <dbl> <dbl>
1 hello 12 23
2 world 23 24
3 again NA NA

R - Identify a sequence of row elements by groups in a dataframe

Consider the following sample dataframe:
> df
id name time
1 1 b 10
2 1 b 12
3 1 a 0
4 2 a 5
5 2 b 11
6 2 a 9
7 2 b 7
8 1 a 15
9 2 b 1
10 1 a 3
df = structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L),
name = c("b", "b", "a", "a", "b", "a", "b", "a", "b", "a"
), time = c(10L, 12L, 0L, 5L, 11L, 9L, 7L, 15L, 1L, 3L)), .Names = c("id",
"name", "time"), row.names = c(NA, -10L), class = "data.frame")
I need to identify and record all sequences seq <- c("a","b"), where "a" precedes "b" based on "time" column, for each id. No other names between "a" and "b" are permitted. Real sequence length is at least 5.
The expected result for the sample data is
a b
1 3 10
2 5 7
3 9 11
There is a similar question Finding rows in R dataframe where a column value follows a sequence. However, it is not clear to me how to deal with the "id" column in my case. Is there a way to solve the problem using "dplyr"?
library(dplyr); library(tidyr)
# sort data frame by id and time
df %>% arrange(id, time) %>% group_by(id) %>%
# get logical vector indicating rows of a followed by b and mark each pair as unique
# by cumsum
mutate(ab = name == "a" & lead(name) == "b", g = cumsum(ab)) %>%
# subset rows where conditions are met
filter(ab | lag(ab)) %>%
# reshape your data frame to wide format
select(-ab) %>% spread(name, time)
#Source: local data frame [3 x 4]
#Groups: id [2]
# id g a b
#* <int> <int> <int> <int>
#1 1 1 3 10
#2 2 1 5 7
#3 2 2 9 11
If the length of the sequence is larger than two, then you will need to check multiple lags. One option is to use the shift function (which accepts a vector as lag/lead steps) from data.table combined with Reduce. For example, if we need to check the pattern abb:
library(dplyr); library(tidyr); library(data.table)
pattern = c("a", "b", "b")
len_pattern = length(pattern)
df %>% arrange(id, time) %>% group_by(id) %>%
# same logic as before but use Reduce function to check multiple lags condition
mutate(ab = Reduce("&", Map("==", shift(name, n = 0:(len_pattern - 1), type = "lead"), pattern)),
g = cumsum(ab)) %>%
# use reduce or to subset sequence rows having the same length as the pattern
filter(Reduce("|", shift(ab, n = 0:(len_pattern - 1), type = "lag"))) %>%
# make unique names
group_by(g, add = TRUE) %>% mutate(name = paste(name, 1:n(), sep = "_")) %>%
# pivoting the table to wide format
select(-ab) %>% spread(name, time)
#Source: local data frame [1 x 5]
#Groups: id, g [1]
# id g a_1 b_2 b_3
#* <int> <int> <int> <int> <int>
#1 1 1 3 10 12
It's somewhat convoluted, but how about a rolling join?
library(data.table)
setorder(setDT(df), id, time)
df[ name == "b" ][
df[, if(name == "a") .(time = last(time)), by=.(id, name, r = rleid(id,name))],
on = .(id, time),
roll = -Inf,
nomatch = 0,
.(a = i.time, b = x.time)
]
a b
1: 3 10
2: 5 7
3: 9 11
You can use an ifelse in filter with lag and lead, and then tidyr::spread to reshape to wide:
library(tidyverse)
df %>% arrange(id, time) %>% group_by(id) %>%
filter(ifelse(name == 'b', # if name is b...
lag(name) == 'a', # is the previous name a?
lead(name) == 'b')) %>% # else if name is not b, is next name b?
ungroup() %>% mutate(i = rep(seq(n() / 2), each = 2)) %>% # create indices to spread by
spread(name, time) %>% select(a, b) # spread to wide and clean up
## # A tibble: 3 × 2
## a b
## * <int> <int>
## 1 3 10
## 2 5 7
## 3 9 11
Based on the comment below, here's a version that uses gregexpr to find the first index of a matched pattern, which while more complicated, scales more easily to longer patterns like "aabb":
df %>% group_by(pattern = 'aabb', id) %>% # add pattern as column, group
arrange(time) %>%
# collapse each group to a string for name and a list column for time
summarise(name = paste(name, collapse = ''), time = list(time)) %>%
# group and add list-column of start indices for each match
rowwise() %>% mutate(i = gregexpr(pattern, name)) %>%
unnest(i, .drop = FALSE) %>% # expand, keeping other list columns
filter(i != -1) %>% # chop out rows with no match from gregexpr
rowwise() %>% # regroup
# subset with sequence from index through pattern length
mutate(time = list(time[i + 0:(nchar(pattern) - 1)]),
pattern = strsplit(pattern, '')) %>% # expand pattern to list column
rownames_to_column('match') %>% # add rownames as match index column
unnest(pattern, time) %>% # expand matches in parallel
# paste sequence onto each letter (important for spreading if repeated letters)
group_by(match) %>% mutate(pattern = paste0(pattern, seq(n()))) %>%
spread(pattern, time) # spread to wide form
## Source: local data frame [1 x 8]
## Groups: match [1]
##
## match id name i a1 a2 b3 b4
## * <chr> <int> <chr> <int> <int> <int> <int> <int>
## 1 1 1 aabba 1 0 3 10 12
Note that if the pattern doesn't happen to be in alphabetical order, the resulting columns will not be ordered by their indices. Since indices are preserved, though, you can sort with something like select(1:4, parse_number(names(.)[-1:-4]) + 4).

Resources