Order column names in ascending order within dplyr chain - r

I have this data.frame:
df <- structure(list(att_number = structure(1:3, .Label = c("0", "1",
"2"), class = "factor"), `1` = structure(c(2L, 3L, 1L), .Label = c("1026891",
"412419", "424869"), class = "factor"), `10` = structure(c(2L,
1L, 3L), .Label = c("235067", "546686", "92324"), class = "factor"),
`2` = structure(c(3L, 1L, 2L), .Label = c("12729", "7569",
"9149"), class = "factor")), .Names = c("att_number", "1",
"10", "2"), row.names = c(NA, -3L), class = "data.frame")
It looks like this having numbers as the column names.
att_number 1 10 2
0 412419 546686 9149
1 424869 235067 12729
2 1026891 92324 7569
Within a dplyr chain, I would like to order the columns in ascending order, like this:
att_number 1 2 10
0 412419 9149 546686
1 424869 12729 235067
2 1026891 7569 92324
I've tried using select_, but it doesn't want to work according to plan. Any idea on how I can do this? Here's my feeble attempt:
names_order <- names(df)[-1] %>%
as.numeric %>%
.[order(.)] %>%
as.character %>%
c('att_number', .)
df %>%
select_(.dots = names_order)
Error: Position must be between 0 and n

Update:
For newer versions of dplyr (>= 0.7.0):
library(tidyverse)
# Reorder the columns of `data`: columns whose names contain any non-digit
# character come first (sorted alphabetically), followed by the columns whose
# names consist only of digits (sorted by numeric value).
#
# data: a data frame (or tibble).
# Returns `data` with the same columns in the new order.
sort_names <- function(data) {
  name <- names(data)
  is_num <- !grepl("[^0-9]", name)
  chars <- sort(name[!is_num])
  nums <- name[is_num]
  # Order by numeric value but keep the original strings. Converting to
  # numeric and back would rewrite names such as "100000" (-> "1e+05") or
  # "01" (-> "1"), and select() would then fail to find the column.
  nums <- nums[order(as.numeric(nums))]
  # Strings are treated as column names by select(), never as positions.
  select(data, !!!c(chars, nums))
}
sort_names(df)
Original:
You need back ticks around the numeric column names to stop select from trying to interpret them as column positions:
library(tidyverse)
# Reorder the columns of `data` for older dplyr versions (select_ interface):
# columns whose names contain any non-digit character come first (sorted
# alphabetically), followed by the all-digit names sorted by numeric value.
#
# data: a data frame (or tibble).
# Returns `data` with the same columns in the new order.
sort_names <- function(data) {
  name <- names(data)
  is_num <- !grepl("[^0-9]", name)
  chars <- sort(name[!is_num])
  nums <- name[is_num]
  # Order by numeric value but keep the original strings. Converting to
  # numeric and back would rewrite names such as "100000" (-> "1e+05") or
  # "01" (-> "1"), so the back-quoted name would no longer match a column.
  nums <- nums[order(as.numeric(nums))]
  # Back ticks stop select_ from interpreting the digits as column positions.
  nums <- sprintf("`%s`", nums)
  select_(data, .dots = c(chars, nums))
}
sort_names(df)

Related

Match strings before special character

I am trying to match strings in two columns and return mismatches before ":". It should not return if x2x, y67y, as x remains x and y remains as y.
I don't want to match the ":decimal" part. If x2y is in both columns then it's a match (irrespective of any mismatch in the decimal after the special character).
INPUT:
input <- structure(list(x = structure(c(1L, 2L, 3L, 3L), .Label = c("A",
"B", "C"), class = "factor"), y = structure(c(2L, 3L, 1L, 4L), .Label = c("A",
"B", "C", "D"), class = "factor"), x_val = c("x2x:0.12345,y67h:0.06732,d7j:0.032647",
"x2y:0.26345,y67y:0.28320,d7r:0.043647", "x2y:0.23435,y67y:0.28310,d7r:0.043547",
"x2y:0.23435,y67y:0.28330,d7r:0.043247"), y_val = c("x2y:0.33134,y67y:0.3131,d7r:0.23443",
"x2y:0.34311,y67y:0.14142,d7r:0.31431", "x2x:0.34314,y67h:0.14141,d7j:0.453145",
"x67b:0.31411,g72v:0.3134,b8c:0.89234")), row.names = c(NA, -4L
), class = "data.frame")
Output:
output <- structure(list(x = structure(c(1L, 2L, 3L, 3L), .Label = c("A",
"B", "C"), class = "factor"), y = structure(c(2L, 3L, 1L, 4L), .Label = c("A",
"B", "C", "D"), class = "factor"), x_val = c("x2x:0.12345,y67h:0.06732,d7j:0.032647",
"x2y:0.26345,y67y:0.28320,d7r:0.043647", "x2y:0.23435,y67y:0.28310,d7r:0.043547",
"x2y:0.23435,y67y:0.28330,d7r:0.043247"), y_val = c("x2y:0.33134,y67y:0.3131,d7r:0.23443",
"x2y:0.34311,y67y:0.14142,d7r:0.31431", "x2x:0.34314,y67h:0.14141,d7j:0.453145",
"x67b:0.31411,g72v:0.3134,b8c:0.89234"), diff_x = c("y67h:0.06732,d7j:0.03264",
NA, "x2y:0.23435,d7r:0.043547", "x2y:0.23435,y67y:0.28330,d7r:0.043247"
), diff_y = c("x2y:0.33134,d7r:0.23443", NA, "y67h:0.14141,d7j:0.453145",
"x67b:0.31411,g72v:0.3134,b8c:0.89234")), row.names = c(NA, -4L
), class = "data.frame")
I run into problem when I just want to match till ":" character. The following code is taken from this question: https://stackoverflow.com/a/55285959/5150629.
library(dplyr)
library(purrr)
I %>% mutate(diff_x = map2_chr(strsplit(x_val, split = ", "),
strsplit(y_val, split = ", "),
~paste(grep('([a-z])(?>\\d+)(?!\\1)', setdiff(.x, .y),
value = TRUE, perl = TRUE),
collapse = ", ")) %>%
replace(. == "", NA),
diff_y = map2_chr(strsplit(x_val, split = ", "),
strsplit(y_val, split = ", "),
~paste(grep('([a-z])(?>\\d+)(?!\\1)', setdiff(.y, .x),
value = TRUE, perl = TRUE),
collapse = ", ")) %>%
replace(. == "", NA))
Can anyone help? Thanks!
I modified my answer in https://stackoverflow.com/a/55285959/5150629 to fit this question:
library(dplyr)
library(purrr)
# Return the "key:value" entries of `own` whose key (the text before ":")
# does not occur among the keys of `other`, skipping keys of the form x1x
# (same letter before and after the digits). The surviving entries are
# collapsed into one ", "-separated string; NA is returned when none remain.
#
# Keys are compared exactly (not as regular expressions), so a key such as
# "d7r" cannot accidentally pick up an entry like "ad7r:0.1".
unmatched_entries <- function(own, other) {
  own_keys <- sub(":.+$", "", own)
  other_keys <- sub(":.+$", "", other)
  is_missing <- !(own_keys %in% other_keys)
  # letter, digits, then anything but the same letter again
  is_change <- grepl("([a-z])(?>\\d+)(?!\\1)", own_keys, perl = TRUE)
  kept <- own[is_missing & is_change]
  if (length(kept) == 0L) {
    NA_character_
  } else {
    paste(kept, collapse = ", ")
  }
}

df %>%
  mutate(
    # entries of x_val with no counterpart key in y_val
    diff_x = map2_chr(
      strsplit(x_val, split = ","),
      strsplit(y_val, split = ","),
      unmatched_entries
    ),
    # entries of y_val with no counterpart key in x_val
    diff_y = map2_chr(
      strsplit(y_val, split = ","),
      strsplit(x_val, split = ","),
      unmatched_entries
    )
  )
Output:
x y x_val y_val diff_x
1 A B x2x:0.12345,y67h:0.06732,d7j:0.032647 x2y:0.33134,y67y:0.3131,d7r:0.23443 y67h:0.06732, d7j:0.032647
2 B C x2y:0.26345,y67y:0.28320,d7r:0.043647 x2y:0.34311,y67y:0.14142,d7r:0.31431 <NA>
3 C A x2y:0.23435,y67y:0.28310,d7r:0.043547 x2x:0.34314,y67h:0.14141,d7j:0.453145 x2y:0.23435, d7r:0.043547
4 C D x2y:0.23435,y67y:0.28330,d7r:0.043247 x67b:0.31411,g72v:0.3134,b8c:0.89234 x2y:0.23435, d7r:0.043247
diff_y
1 x2y:0.33134, d7r:0.23443
2 <NA>
3 y67h:0.14141, d7j:0.453145
4 x67b:0.31411, g72v:0.3134, b8c:0.89234
Notes:
Since we are only interested in comparing the first part of the string format x1y:000000, I added a sub(":.+$", "", .x) for each map2_chr input argument to strip out the :000000 part first.
setdiff and the following grep steps work as expected to return the mismatches and exclude strings with the form x1x.
sapply(grep, .x, value = TRUE) after the first grep takes the vector of mismatches, and searches for their corresponding original strings (in x1y:000000 form).
paste collapses the vector of mismatches into a single comma separated list.

How do I split a column in R into two columns when I have no delimiter?

I have a dataset called data1 in which I need to split the first column into two columns. The issue I'm having is that there is no delimiter between the parts I need to split, and the character lengths differ in many rows.
I would like to split it by the date and sex.
E.g
12/1/09male
1/9/20female
13/1/19female
4/12/12male
I've been trying this but because the values have a different amount of characters I'm stuck.
separate(data1, col = 1, into = c("date","sex"), sep = "")
Any help would be hugely appreciated!
An option is a positive look-behind and look-ahead to split on a digit followed by an "m" or "f".
df %>% separate(1, c("date", "sex"), sep = "(?<=\\d)(?=[mf])")
# date sex
#1 12/1/09 male
#2 1/9/20 female
#3 13/1/19 female
#4 4/12/12 male
For what it's worth, the same regexp pattern works in base R's strsplit
setNames(do.call(
rbind.data.frame,
strsplit(as.character(df[, 1]), "(?<=\\d)(?=[mf])", perl = T)),
c("date", "sex"))
Sample data
df <- read.table(text =
'12/1/09male
1/9/20female
13/1/19female
4/12/12male')
I am fairly new to R so I am sure this is not the most elegant solution. I first add a comma between the date and sex and then separate on the comma
a <- data.frame(row_1 = c("12/1/09male", "1/9/20female", "13/1/19female", "4/12/12male"))
a[, "row_1"] = str_replace(a$row_1, "(male|female)", ",\\1")
separate(a, row_1, ",", into = c("date", "sex"))
Using tidyr::extract, we can capture data into two parts. First capture the date (in the format d/m/y) and second capture all the remaining part of the string.
tidyr::extract(df, V1, c("date", "sex"), "(\\d+/\\d+/\\d+)(.*)")
# date sex
#1 12/1/09 male
#2 1/9/20 female
#3 13/1/19 female
#4 4/12/12 male
data
df <- structure(list(V1 = structure(c(2L, 1L, 3L, 4L), .Label = c("1/9/20female",
"12/1/09male", "13/1/19female", "4/12/12male"), class = "factor")),
class = "data.frame", row.names = c(NA,-4L))
Base R solution using gsub and some regex:
df_clean <- within(df, {
date <- as.Date(gsub("[A-Za-z]+", "", V1), format = "%d/%m/%y")
sex <- as.factor(gsub("\\d+|\\/", "", V1))
rm(V1)
}
)
Data:
df <- structure(list(V1 = structure(c(2L, 1L, 3L, 4L), .Label = c("1/9/20female",
"12/1/09male", "13/1/19female", "4/12/12male"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))

How to convert a string to a variable and to loop through group_by?

Assume I have a dataset with two columns, Location and Product, that shows how many of each product is sold at each location. I create a contingency table for the number of each product sold at each location:
# Count the rows for every Location/Product pair, then spread the products
# into columns (one row per Location, one column per Product value).
data %>%
  group_by(Location, Product) %>%
  summarize(n = n()) %>%
  # names_from must use the column's exact (case-sensitive) name: Product
  pivot_wider(names_from = Product, values_from = n)
Now, imagine that instead of a single Product column, I have US_Product, Japan_Product,..., Germany_Product. How can I create my contingency tables in a for loop?
NOTE: when I create a vector of products like p<-c("Product1", "Product2",..., "Product3") and loop through these products, I get an error message because these are strings and not variable names.
Here is a minimal example:
Location <- c("AB","ON","MN","AB","ON")
Product1<-c("Type1","Type2","Type1","Type3","Type1")
Product2<-c("Type3","Type2","Type3","Type3","Type2")
Product3<-c("Type1","Type2","Type1","Type1","Type1")
data <- tibble(Location,Product1,Product2,Product3)
data%>%
group_by(Location,Product1)%>%
summarize(n=n()) %>%
pivot_wider(names_from = Product1, values_from = n) #this works as expected
#now I want to do the same thing in a loop
prodV <- c("Product1","Product2","Product3")
for (i in c(1:3)){
var <- prodV[i]
data%>%
group_by(Location,var)%>%
summarize(n=n()) %>%
pivot_wider(names_from = var, values_from = n)
}
If we need to use it in a loop, then one option is map
library(dplyr)
library(purrr)
library(tidyr)
map(p, ~
data%>%
group_by_at(vars("Location", .x)) %>%
summarize(n=n()) %>%
pivot_wider(names_from = .x, values_from = n))
Using a reproducible example
data(mtcars)
p <- c("cyl", "vs", "am")
map(p, ~
mtcars %>%
group_by_at(vars('gear', .x)) %>%
summarise(n = n()) %>%
pivot_wider(names_from = .x, values_from = n) )
Or if we use a for loop, then create an empty list to store the output from each iteration ('out'), loop over the 'p' values, and change only the .x part from map while assigning the output to each element of 'out' list
out <- vector('list', length(p))
names(out) <- p
for(p1 in p) {
out[[p1]] <- data %>%
group_by_at(vars("Location", p1)) %>%
summarize(n = n()) %>%
pivot_wider(names_from = p1, values_from = n)
}
Not sure if the following is the thing you are after. Below is a base R solution to make contingency tables:
p <- c("US_Product","Japan_product","Germany_Product")
res <- Map(function(x) table(df[c("Location",x)]),p)
such that
> res
$US_Product
US_Product
Location a b c
XX 2 0 1
YY 1 1 2
$Japan_product
Japan_product
Location d e f
XX 0 2 1
YY 3 0 1
$Germany_Product
Germany_Product
Location g i j
XX 0 3 0
YY 1 1 2
Dummy DATA
# Dummy data (output of dput(df)): 7 rows, a Location factor and three
# product factors.
df <- structure(list(Location = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("XX", "YY"), class = "factor"), US_Product = structure(c(1L,
3L, 1L, 2L, 1L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"),
Japan_product = structure(c(2L, 2L, 3L, 3L, 1L, 1L, 1L), .Label = c("d",
"e", "f"), class = "factor"), Germany_Product = structure(c(2L,
2L, 2L, 2L, 3L, 1L, 3L), .Label = c("g", "i", "j"), class = "factor")), class = "data.frame", row.names = c(NA,
-7L))
I was able to handle the problem using group_by_at as opposed to group_by. According to dplyr: whats the difference between group_by and group_by_ functions?
if one needs to have inputs with quotation marks, SE versions of functions should be used, instead of NSE versions---please see the link for a detailed explanation.
# Build and print one Location x product contingency table per product column.
prodV <- c("Product1", "Product2", "Product3")
# Iterate over the column names themselves so the loop follows prodV's length
# instead of a hard-coded 1:3.
for (var in prodV) {
  a <- data %>%
    # group_by_at accepts column names given as strings
    group_by_at(vars("Location", var)) %>%
    summarize(n = n()) %>%
    # all_of() marks `var` as an external character vector of column names,
    # so it cannot be confused with a column that happens to be called "var"
    pivot_wider(names_from = all_of(var), values_from = n)
  print(a)
}

Get single column of values comparing multiple columns

I have just started my journey with R. I want to test values across multiple columns for the same condition and return 5 if any of the values is "hello" within a row:
result = ifelse((myData[1] == "hello") | (myData[2] == "hello") | (myData[3] == "hello"), 5, 0)
This works fine, but code seems to be redundant. When I do:
resultSec = ifelse(myData[1:3] == "hello", 5, 0)
Then all 3 columns are checked against the condition, but the result I get is not a single column but 3 columns. I would then have to perform an additional comparison across those columns, which takes more lines of code than the first, redundant method.
How can I get a single column of values in an efficient way?
You can use the function apply() to iterate over a data.frame or matrix, by either columns or rows. The margin argument determines which one you use.
Here we want to check the rows, so we use margin = 1:
dat <- data.frame(col1 = c("happy", "sad", "mad"),
col2 = c("tired", "sleepy", "happy"),
col3 = c("relaxed", "focused", "fine"))
dat$res <- apply(X = dat, MARGIN = 1,
FUN = function(x) ifelse("happy" %in% x, 5, 0))
dat
col1 col2 col3 res
1 happy tired relaxed 5
2 sad sleepy focused 0
3 mad happy fine 5
We can use rowSums here
# rowSums counts the matching cells per row; comparing with 0 first turns
# that count into "any match", so a row with several matches still gets 5
# (not 10 or 15).
df1$res <- (rowSums(df1 == "happy") > 0) * 5
df1$res
#[1] 5 0 5
data
df1 <- structure(list(col1 = structure(c(1L, 3L, 2L), .Label = c("happy",
"mad", "sad"), class = "factor"), col2 = structure(c(3L, 2L,
1L), .Label = c("happy", "sleepy", "tired"), class = "factor"),
col3 = structure(c(3L, 2L, 1L), .Label = c("fine", "focused",
"relaxed"), class = "factor")), .Names = c("col1", "col2",
"col3"), row.names = c(NA, -3L), class = "data.frame")

Convert lines from factor to numeric?

this example:
dat=structure(list(X = structure(c(1L, 2L,3L), .Label = c("A", "B", "C"), class = "factor"), X10 = structure(c(1L,2L,3L), .Label = c("3","0", "2"), class = "factor"), X11 = structure(c(1L, 2L,3L), .Label = c("0", "2", "0"), class = "factor")), class = "data.frame", row.names = c(NA, -3L))
dat=dat[,-1]
fi=as.numeric(as.character(dat[1,] ))
> fi
[1] 1 1
Which is not correct. I wonder what is wrong ?
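as.numeric works on vectors, but dat[1, ] is still a data frame (a list of factor columns), so as.character returns the underlying integer codes of the factors rather than their labels. You need to use apply if you want to apply the conversion to a data frame: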
apply(dat, MARGIN=2,FUN=as.numeric)
result:
X10 X11
[1,] 3 0
[2,] 0 2
[3,] 2 0
For multiple columns of different class, we can have a check whether it is factor or not to do the conversion
library(dplyr)
# Convert only the factor columns: go through character first so the labels
# (not the internal integer codes) are turned into numbers.
# A plain function is passed instead of the deprecated funs() wrapper.
dat %>%
  mutate_if(is.factor, function(x) as.numeric(as.character(x)))
and if all the columns are factor, then use mutate_all
# Convert every column: go through character first so the factor labels
# (not the internal integer codes) are turned into numbers.
# A plain function is passed instead of the deprecated funs() wrapper.
dat %>%
  mutate_all(function(x) as.numeric(as.character(x)))
The base R way if all columns are factor, use lapply and assign it to the original object
dat[] <- lapply(dat, function(x) as.numeric(as.character(x)))

Resources