concatenating and Indexing highest values by grouped columns - r

Hello guys, I hope you are doing great. I have the following dataset:
A1
A2
A3
A4
A5
A6
A7
-1.2650612
-0.6868529
-0.4456620
1.2240818
0.3598138
0.4007715
0.1106827
# Column labels A1..A7. The prefix must come first: paste0(1:7, "A") would
# produce "1A".."7A", which does not match the dataset header.
colnames <- paste0("A", 1:7)
# Fix the RNG seed so the seven simulated values are reproducible.
set.seed(123)
values <- rnorm(7)
I want to write code that finds the highest value (row-wise) within the first 3 columns (A1, A2, A3) and writes the corresponding column name, then repeats the same for the following 2 columns (A4, A5) and for the remaining 2 columns (A6, A7). If all values in a group are the same (which may happen), it should just write the name of the first column within the group.
My desired output will be:
A1,A2,A3
A4,A5
A6,A7
A3
A4
A5
Thank you guys so much. I will be watching closely and will reward the answer of whoever can help me!

Here is one option
library(dplyr)
library(tibble)
library(tidyr)
# Reshape the data to long form, label the three column groups (sizes 3, 2
# and 2), and for each group keep the name of the column holding the largest
# value together with a comma-separated label of the group's columns.
df1 %>%
  pivot_longer(cols = everything()) %>%
  group_by(group = rep(1:3, c(3, 2, 2))) %>%
  summarise(
    name1 = name[which.max(value)],
    name2 = toString(name)
  ) %>%
  # name2 (the group label) becomes the column name, name1 its value.
  select(name2, name1) %>%
  deframe() %>%
  as_tibble_row()
-output
# A tibble: 1 × 3
`A1, A2, A3` `A4, A5` `A6, A7`
<chr> <chr> <chr>
1 A3 A4 A6
data
# Example input: a single row with seven numeric columns A1..A7.
df1 <- data.frame(
  A1 = -1.2650612,
  A2 = -0.6868529,
  A3 = -0.445662,
  A4 = 1.2240818,
  A5 = 0.3598138,
  A6 = 0.4007715,
  A7 = 0.1106827
)

Related

Using dplyr to sort a table by two alphanumeric columns

I have a dataset that looks like this:
library(dplyr)
Data <- tibble(
Area1 = rep(c("A1 1AA", "B3 4TT","D1 1AA", "A10 6TY","A2 9GG"),2),
Area2 = c("A2 7BB", "B11 5TT","A14 9SS","A4 4HH","V6 9FF", "A11 6TT","B4 3DD","D1 4FF","G5 7DD","A2 7YY"))
I would like to sort it by Area1 and then Area2, however arrange does not produce the desired result because it's in lexicographical order.
Data %>% arrange(Area1,Area2) #not the desired order
Is there a way using dplyr to produce this output that is in the desired order?
Output <- tibble(
Area1 = c("A1 1AA", "A1 1AA", "A2 9GG","A2 9GG","A10 6TY","A10 6TY", "B3 4TT","B3 4TT","D1 1AA","D1 1AA"),
Area2 = c("A2 7BB", "A11 6TT","A2 7YY","V6 9FF","A4 4HH", "G5 7DD","B4 3DD","B11 5TT","A14 9SS","D1 4FF"))
Here is another option using arrange() and str_sort():
library(dplyr)
library(stringr)
# Sort by every column whose name starts with "Area". Each value is replaced
# by its position among that column's unique values as ordered by
# str_sort(numeric = TRUE), so digit runs compare as numbers.
Data %>%
  arrange(
    across(
      starts_with("Area"),
      function(area) match(area, str_sort(unique(area), numeric = TRUE))
    )
  )
# A tibble: 10 x 2
Area1 Area2
<chr> <chr>
1 A1 1AA A2 7BB
2 A1 1AA A11 6TT
3 A2 9GG A2 7YY
4 A2 9GG V6 9FF
5 A10 6TY A4 4HH
6 A10 6TY G5 7DD
7 B3 4TT B4 3DD
8 B3 4TT B11 5TT
9 D1 1AA A14 9SS
10 D1 1AA D1 4FF
Seems like we can use mixedorder with slice
library(dplyr)
library(gtools)
library(stringr)
# Reorder the rows by the mixed (alphanumeric) order of the two area codes
# pasted together.
Output2 <- slice(Data, mixedorder(str_c(Area1, Area2)))
Or another option is to remove the numeric, non-numeric separately and use that in arrange
# Sort on four keys: Area1 with its digits removed, the first number parsed
# from Area1, then the same two keys for Area2.
Output3 <- arrange(
  Data,
  str_remove_all(Area1, "\\d+"),
  readr::parse_number(Area1),
  str_remove_all(Area2, "\\d+"),
  readr::parse_number(Area2)
)
-checking with OP's expected
identical(Output, Output2)
#[1] TRUE
identical(Output, Output3)
#[1] TRUE

R convert matrix into table

there is a matrix:
# 5 x 10 indicator matrix: one row per ID, one column per position A1..A10.
# The values are listed row by row, hence byrow = TRUE.
mat <- matrix(
  c(
    0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 1, 1, 1, 1
  ),
  nrow = 5,
  ncol = 10,
  byrow = TRUE,
  dimnames = list(paste0("ID_", 1:5), paste0("A", 1:10))
)
I want to convert this matrix into a table with three columns - "ID", "start" and "stop", where "start" is a column with the first value (1) in row "ID", "stop" is a column with the last value in the row. I would like to receive this output:
Could You please help me?
Thanks in advance.
Here is one way using dplyr, converting the matrix to dataframe, convert rownames to column, get the data in long format, filter rows with value = 1 and select first and last column name for each id.
library(dplyr)
# Convert the matrix to long (id, name, value) rows, keep only the cells
# equal to 1, and report the first and last column name for each id.
mat %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "id") %>%
  tidyr::pivot_longer(cols = -id) %>%
  filter(value == 1) %>%
  group_by(id) %>%
  summarise(
    start = first(name),
    stop = last(name)
  )
# A tibble: 5 x 3
# id start stop
# <chr> <chr> <chr>
#1 ID_1 A3 A6
#2 ID_2 A4 A6
#3 ID_3 A4 A9
#4 ID_4 A6 A9
#5 ID_5 A7 A10
In base R and keeping mat as matrix :
# For each row of `mat`, return the names of the first and last columns whose
# value is 1. The result is a character matrix with columns "start" and
# "stop" and one row per row of `mat`.
t(apply(mat, 1, function(x) {
  inds <- which(x == 1)
  # A row without any 1 has no start/stop; return NA explicitly instead of
  # calling min()/max() on an empty vector (which warns and indexes by Inf).
  if (length(inds) == 0L) {
    return(c(start = NA_character_, stop = NA_character_))
  }
  # which() returns ascending indices, so the ends are the min and the max.
  c(
    start = colnames(mat)[inds[1L]],
    stop = colnames(mat)[inds[length(inds)]]
  )
}))
You can do this using the ties.method argument in max.col. Use the result to subset the colnames.
# max.col() returns, for each row, the column index of the row maximum;
# ties.method selects the first or the last of the tied columns.
data.frame(
  id = rownames(mat),
  start = colnames(mat)[max.col(mat, ties.method = "first")],
  stop = colnames(mat)[max.col(mat, ties.method = "last")]
)
# id start stop
# 1 ID_1 A3 A6
# 2 ID_2 A4 A6
# 3 ID_3 A4 A9
# 4 ID_4 A6 A9
# 5 ID_5 A7 A10

Add missing index in a dataframe

Hi I have a messy data frame as follows:
df <- data.frame(age.band = c("0-5","5-10"), beg.code = c("A1","B1"), end.code=c("A5","B3"),value = c(10,5))
age.band beg.code end.code value
0-5 A1 A5 10
5-10 B1 B3 5
I would like to transform it into a friendlier format such as:
index age.band value
A1 0-5 10
A2 0-5 10
A3 0-5 10
A4 0-5 10
A5 0-5 10
B1 5-10 5
B2 5-10 5
B3 5-10 5
Can anyone help me to find a way to add all the missing indexes for this dataframe? Thanks
A solution using dplyr and tidyr. Notice that I added stringsAsFactors = FALSE to avoid creating factor columns when creating your example data frame. If you run the code on your original data frame, you will receive a warning message due to the factor columns, but it will not affect the end result.
library(dplyr)
library(tidyr)
# Expand each (beg.code, end.code) pair into one row per code in the range.
df2 <- df %>%
  # Stack the two code columns into a single Value column.
  gather(Code, Value, ends_with("code")) %>%
  # Split each code into its letter prefix and its number. The pattern is
  # anchored and non-overlapping so multi-digit codes such as "A10" yield
  # Group "A" and Index 10 (a greedy ".*" would give "A1" and 0).
  extract(
    Value,
    into = c("Group", "Index"),
    regex = "^([A-Za-z]+)(\\d+)$",
    convert = TRUE
  ) %>%
  select(-Code) %>%
  group_by(Group) %>%
  # Add the missing integer indexes between the smallest and largest one.
  complete(Index = full_seq(Index, period = 1)) %>%
  unite(Index, c("Group", "Index"), sep = "") %>%
  # Carry age.band and value down into the newly added rows.
  fill(-Index)
df2
# # A tibble: 8 x 3
# Index age.band value
# * <chr> <chr> <dbl>
# 1 A1 0-5 10
# 2 A2 0-5 10
# 3 A3 0-5 10
# 4 A4 0-5 10
# 5 A5 0-5 10
# 6 B1 5-10 5
# 7 B2 5-10 5
# 8 B3 5-10 5
DATA
df <- data.frame(age.band = c("0-5","5-10"), beg.code = c("A1","B1"), end.code=c("A5","B3"),value = c(10,5),
stringsAsFactors = FALSE)
Here is one option with base R. The idea is to remove the non-numeric characters from the 'code' columns, convert it to numeric and get the sequence stored as a list. Then, paste the non-numeric characters and finally, based on the lengths of the list, expand the rows of the original dataset with rep and create a new column 'index' by unlisting the list
# Numeric part of the begin/end codes (columns 2 and 3), turned into one
# integer sequence per row of `df`.
lst <- do.call(
  Map,
  c(f = `:`, lapply(df[2:3], function(x) as.numeric(sub("\\D+", "", x))))
)
# Re-attach the prefix of the begin code to every number in the sequence.
# The prefix is everything before the trailing digits, so prefixes longer
# than one character (e.g. "AB1") are kept whole.
lst1 <- Map(paste0, sub("\\d+$", "", df[, 2]), lst)
# Repeat each original row once per generated index and drop the two code
# columns.
data.frame(
  index = unlist(lst1),
  df[rep(seq_len(nrow(df)), lengths(lst1)), -(2:3)]
)

Sorting the values of column in ascending order in R

The script below is a data frame of four columns. My need is that I want to take a pair of values (a1, a2) at a time. The column "a3" is such that if you check a pair, say (a1, a2), as you span the data, the pair's values are arranged in ascending order. If there is a duplicate of the pair present in the table, I want the "a4" column values to be arranged just like the corresponding "a3" column, in ascending order for the particular (a1, a2) value. Take the first (a1, a2) pair ("A", "D"): the pair appears three times and the corresponding a3 values are in ascending order. Similarly, I wish to arrange the a4 values in ascending order within each pair. Please check the expected outcome. Thanks, and please suggest.
a1 = c("A","B","C","A","B","C","A","C")
a2 = c("D","E","F","D","E","E","D","F")
a3 = c(5,15,12,10,40,35,20,50)
a4 = c(100,160,66,65,130,150,80,49)
a123= data.frame(a1,a2,a3,a4)
library(dplyr)
a123_r <- a123 %>%
group_by(a1, a2) %>%
mutate(a3 = sort(a3)) %>%
ungroup()
a123_r
Expected Output
a1 = c("A","B","C","A","B","C","A","C")
a2 = c("D","E","F","D","E","E","D","F")
a3 = c(5,15,12,10,40,35,20,50)
a4 = c(65,130,66,80,160,150,100,49)
a123_r <- data.frame(a1,a2,a3,a4)
For the sake of completeness, here is an answer using data.table:
library(data.table)
# Columns to be sorted within each (a1, a2) group.
cols <- c("a3", "a4")
# Convert a123 to a data.table by reference, then overwrite both columns with
# their sorted values per group.
setDT(a123)
a123[, (cols) := lapply(.SD, sort), by = .(a1, a2), .SDcols = cols]
# Print the updated table.
a123[]
a1 a2 a3 a4
1: A D 5 65
2: B E 15 130
3: C F 12 49
4: A D 10 80
5: B E 40 160
6: C E 35 150
7: A D 20 100
8: C F 50 66
Data
# Example data: two key columns (a1, a2) and two numeric columns (a3, a4).
a1 <- c("A", "B", "C", "A", "B", "C", "A", "C")
a2 <- c("D", "E", "F", "D", "E", "E", "D", "F")
a3 <- c(5, 15, 12, 10, 40, 35, 20, 50)
a4 <- c(100, 160, 66, 65, 130, 150, 80, 49)
a123 <- data.frame(a1, a2, a3, a4)

Convert columns i to j to percentage

Suppose I have the following data:
df1 <- data.frame(name=c("A1","A1","B1","B1"),
somevariable=c(0.134,0.5479,0.369,NA),
othervariable=c(0.534, NA, 0.369, 0.3333))
In this example, I want to convert columns 2 and 3 to percentages (with one decimal point). I can do it with this code:
library(scales)
df1 %>%
mutate(somevariable=try(percent(somevariable),silent = T),
othervariable=try(percent(othervariable),silent = T))
But I'm hoping there is a better way, particularly for the case where I have many columns instead of just 2.
I tried mutate_each but I'm doing something wrong...
df1 %>%
mutate_each(funs = try(percent(),silent = T), -name)
Thanks!
Here's an alternative approach using a custom function. This function will only modify numeric vectors, so there is no need to worry about try or about removing non-numeric columns. It will also handle NAs by default.
# Format a numeric vector as percentages rounded to one decimal place
# (0.134 -> "13.4%"). Missing values stay NA, and any non-numeric input is
# returned unchanged.
myfun <- function(x) {
  if (!is.numeric(x)) {
    return(x)
  }
  missing <- is.na(x)
  formatted <- paste0(round(x * 100L, 1), "%")
  ifelse(missing, x, formatted)
}
# Apply myfun to every column; it leaves non-numeric columns untouched.
# across() replaces the deprecated mutate_each()/funs() pair.
df1 %>% mutate(across(everything(), myfun))
# name somevariable othervariable
# 1 A1 13.4% 53.4%
# 2 A1 54.8% <NA>
# 3 B1 36.9% 36.9%
# 4 B1 <NA> 33.3%
Try
df1 %>%
mutate_each(funs(try(percent(.), silent=TRUE)), -name)
# name somevariable othervariable
#1 A1 13.4% 53.4%
#2 A1 54.8% NA%
#3 B1 36.9% 36.9%
#4 B1 NA% 33.3%
If you need to keep the NAs from being converted to percentages,
df1 %>%
mutate_each(funs(try(ifelse(!is.na(.), percent(.), NA),
silent=TRUE)),-name)
# name somevariable othervariable
#1 A1 13.4% 53.4%
#2 A1 54.8% <NA>
#3 B1 36.9% 36.9%
#4 B1 <NA> 33.3%

Resources