Paste letter index on string R - r

I want to paste a number and some letters together to index them. The columns of my dataframe are as follows;
When CNTR is NA, i want it to be the booking number + an index, so for booking 202653 for example, I want it to be 202653A and 202653B. I already achieved pasting the booking numbers into the CNTR column when its empty with;
dfUNIT$CNTR <- ifelse(is.na(dfUNIT$CNTR), dfUNIT$BOOKING, dfUNIT$CNTR)
which gives me the following table;
But as I said, I need unique CNTR values. My dataframe contains thousands of rows and changes frequently, is there a way to 'index' them the way I want (A, B, C etc)? Thank you in advance

I'll make up some data,
dat <- data.frame(B=c(202658,202654,202653,202653),C=c("TCLU","KOCU",NA,NA))
dplyr
library(dplyr)
dat %>%
group_by(B) %>%
mutate(C = if_else(is.na(C), paste0(B, LETTERS[row_number()]), C))
# # A tibble: 4 x 2
# # Groups: B [3]
# B C
# <dbl> <chr>
# 1 202658 TCLU
# 2 202654 KOCU
# 3 202653 202653A
# 4 202653 202653B
A fundamental risk in this is if you ever have more than 26 rows for a booking, in which case the letter-suffix will fail. An alternative is to append a number instead (e.g., paste0(B, "_", row_number()) or add some other safeguards.
base R alternatives
do.call(rbind, by(dat, dat[,"B",drop=FALSE],
FUN = function(z) transform(z,
C = ifelse(is.na(C), paste0(B, LETTERS[seq_along(z$C)]), C)
)
))
or
append <- ave(dat$C, dat$B, FUN = function(z) ifelse(is.na(z), LETTERS[seq_along(z)], ""))
append
# [1] "" "" "A" "B"
dat$C <- paste0(ifelse(is.na(dat$C), dat$B, dat$C), append)
dat
# B C
# 1 202658 TCLU
# 2 202654 KOCU
# 3 202653 202653A
# 4 202653 202653B

If you don't insist on using letters to index the transformations, here's arough and ready dplyr solution based on rleid from the data.table package:
library(dplyr)
library(data.table)
df %>%
group_by(grp = rleid(B)) %>%
mutate(CNTR_new = if_else(is.na(CNTR), paste0(B, "_", grp), CNTR))
# A tibble: 7 x 4
# Groups: grp [5]
B CNTR grp CNTR_new
<dbl> <chr> <int> <chr>
1 12 TCU 1 TCU
2 13 NA 2 13_2
3 13 NA 2 13_2
4 15 NA 3 15_3
5 1 PVDU 4 PVDU
6 1 NA 4 1_4
7 5 NA 5 5_5
Data:
df <- data.frame(
B = c(12,13,13,15,1,1,5),
CNTR = c("TCU", NA, NA, NA, "PVDU", NA, NA)
)

Related

Extract first Non NA value over multiple columns

I'm still learning R and was wondering if I there was an elegant way of manipulating the below df to achieve df2.
I'm not sure if it's a loop that is supposed to be used for this, but basically I want to extract the first Non NA "X_No" Value if the "X_No" value is NA in the first row. This would perhaps be best described through an example from df to the desired df2.
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
I'm hoping for an elegant solution to this as there are over a 1000 columns similar to the example provided.
I've looked all over the web for a similar example however to no avail that would reproduce the expected result.
Your help is very much appreciated.
Thankyou
I don't know if I'd call it "elegant", but here is a potential solution:
library(tidyverse)
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
output <- df %>%
pivot_longer(everything(),
names_sep = "_",
names_to = c("Col", ".value")) %>%
drop_na() %>%
group_by(Col) %>%
slice_head(n = 1) %>%
ungroup() %>%
select(-Col)
df2
#> ID No
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
output
#> # A tibble: 5 × 2
#> ID No
#> <chr> <dbl>
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
all_equal(df2, output)
#> [1] TRUE
Created on 2023-02-08 with reprex v2.0.2
Using base R with max.col (assuming the columns are alternating with ID, No)
ind <- max.col(!is.na(t(df[c(FALSE, TRUE)])), "first")
m1 <- cbind(seq_along(ind), ind)
data.frame(ID = t(df[c(TRUE, FALSE)])[m1], No = t(df[c(FALSE, TRUE)])[m1])
ID No
1 A 11
2 D 12
3 F 13
4 M 40
5 W 50
Here is a data.table solution that should scale well to a (very) large dataset.
functionally
split the data.frame to a list of chunks of columns, based on their
names. So all columns startting with A_ go to
the first element, all colums startting with B_ to the second
Then, put these list elements on top of each other, using
data.table::rbindlist. Ignure the column-namaes (this only works if
A_ has the same number of columns as B_ has the same number of cols
as n_)
Now get the first non-NA value of each value in the first column
code
library(data.table)
# split based on what comes after the underscore
L <- split.default(df, f = gsub("(.*)_.*", "\\1", names(df)))
# bind together again
DT <- rbindlist(L, use.names = FALSE)
# extract the first value of the non-NA
DT[!is.na(A_No), .(No = A_No[1]), keyby = .(ID = A_ID)]
# ID No
# 1: A 11
# 2: D 12
# 3: F 13
# 4: G 14
# 5: I 15
# 6: M 40
# 7: P 20
# 8: W 50
# 9: X 32
#10: Y 48
#11: Z 40

performing multiple functions on every columns and rows of a list of dataframes using their names stored in a list

DATA
foo <- dplyr::tibble(a=c("a","b",NA),b=c("a","b","c"),colC=NA)
bar <- dplyr::tibble(a=c("a","b",NA),b=c("a","b","c"),colC=NA)
all_tibbles <- c("foo","bar")
lapply(mget(all_list), function(y) sapply(y, function(x) all(is.na(x))))
$foo
# A tibble: 3 x 3
a b colC
<chr> <chr> <lgl>
1 a a NA
2 b b NA
3 NA c NA
$bar
# A tibble: 3 x 3
a b colC
<chr> <chr> <lgl>
1 a a NA
2 b b NA
3 NA c NA
I would like to remove all columns from every data frame in mget(all_list)
This created the logical vector using base apply functions.
lapply(mget(all_tibbles), function(y) sapply(y, function(x) all(is.na(x))))
Then remove all rows with the minimum number of missing values
lapply(mget(all_tibbles),function(x)
x[-which.min(rowSums((!is.na(x)))),])
and then store these back in the same variables foo and bar. I have a large character vector with tibble names btw.
Can I use a tidyr package to simplify things? base functions are fairly complicated, and am trying to avoid for loops
An option is select_if
library(dplyr)
library(purrr)
library(stringr)
out <- mget(all_tibbles) %>%
map(~ .x %>%
select_if(~ any(!is.na(.))))
out
#$foo
# A tibble: 3 x 2
# a b
# <chr> <chr>
#1 a a
#2 b b
#3 <NA> c
#$bar
# A tibble: 3 x 2
# a b
# <chr> <chr>
#1 a a
#2 b b
#3 <NA> c
names(out) <- str_c(names(out), "_edited")
If we need to update "foo", "bar" (not recommended)
list2env(out, .GlobalEnv)
Or using keep
mget(all_tibbles) %>%
map(~ keep(.x, colSums(!is.na(.)) > 0))
For the second case with rows
out1 <- mget(all_tibbles) %>%
map(~ .x %>%
slice(-which.min(rowSums(!is.na(.)))))
names(out2) <- str_c(names(out), "_edited2")
list2env(out2, .GlobalEnv)
Or we can use Filter from base R to remove columns (OP already showed a base R option for removing rows)
lapply(mget(all_tibbles), function(x)
Filter(function(y) any(!is.na(y)), x))

How to calculate rowMeans of columns with similar colnames in r?

I have a data frame with similar colnames.
I want to calculate rowMeans of columns A and B.
How can I do rowMeans between all A and B columns?
df <- data.frame(A1=c(1,2),A2=c(3,4),A3=c(5,6),A4=c(7,7),A5=c(8,8),A6=c(9,9))
colnames(df)<- c("A","A","B","B","B","C")
An option would be split by the similar column names into a list and then get the rowMeans
i1 <- grep("^(A|B)", names(df))
sapply(split.default(df[i1], names(df)[i1]), rowMeans)
# A B
#[1,] 2 6.666667
#[2,] 3 7.000000
We can iterate over unique names, subset them from original dataframe and take rowMeans.
sapply(c("A", "B"), function(x) rowMeans(df[,colnames(df) == x]))
# A B
#[1,] 2 6.67
#[2,] 3 7.00
An other option using the tidyverse:
library(tidyverse)
df[, "rn"] <- 1:nrow(df)
df %>%
gather(letter, value, -rn) %>%
mutate(letter = str_extract(letter, "[:alpha:]")) %>%
group_by(letter, rn) %>%
summarize(sum = mean(value)) %>%
filter(letter %in% c("A", "B"))
#> # A tibble: 4 x 3
#> # Groups: letter [2]
#> letter rn sum
#> <chr> <int> <dbl>
#> 1 A 1 2
#> 2 A 2 3
#> 3 B 1 6.67
#> 4 B 2 7
You would simply need to submit the dataframe by the columns you want, and then apply the rowMeans() function.
df <- data.frame(A1=c(1,2),A2=c(3,4),A3=c(5,6),A4=c(7,7),A5=c(8,8),A6=c(9,9))
colnames(df)<- c("A","A","B","B","B","C")
rowSums(df[,which(colnames(df) %in% c("A","B"))])
#[1] 24 27
However, as r2evans pointed out in the comment, you should avoid columns with the same names. You would just want to get the position of the columns that determine the start and end of the number of columns between and subset.
colnames(df) <- c(paste0("A",1:2), paste0("B", 1:3), "C1")
strt <- which(colnames(df) == "A1")
end <- which(colnames(df) == "B3")
columrange <- strt:end
rowSums(df[,columrange])
#[1] 24 27
There are many ways to subset by column names. If you didn't rename your columns in your example, you could use grepl() to find them:
df[,grepl("A",colnames(df)) | grepl("B",colnames(df))]
# A1 A2 B1 B2 B3
#1 1 3 5 7 8
#2 2 4 6 7 8

Group data by factor level, then transform to data frame with colname being levels?

There is my problem that I can't solve it:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
nlevels = 2
df1 <- df %>%
add_count(f1) %>%
filter(n == nlevels) %>%
select(-n) %>%
mutate(rn = row_number()) %>%
spread(f1, v1) %>%
select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd like code this, may it helps for you
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'),f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length((v1)))
dd= merge(aa,cc) #get the level
ee = dd[dd$aa==2,] #select number of level equal to 2
ee$id = rep(c(1,2),nrow(ee)/2) # reset index like (1,2,1,2)
dcast(ee, id~v1,value.var = 'f1')
all done!

How can I replace a factor levels with the top n levels (by number of occurances)

This question is related to How can I replace a factor levels with the top n levels (by some metric), plus [other]?. As a metric I want to use the number of occurrences of the factor. I know I can do it by making a list of the occurrences, but I was wondering if there is a prettier way.
Example:
library(data.table);
library(plyr);
fac <- data.table(score = as.factor(c(3,4,5,3,3,3,5)));
ocCnt <- data.table(lapply(fac,count)$score);
fac$occurrence <- 0;
for(i in 1:length(fac$score)){fac$occurrence[i]<-ocCnt[x==fac$score[i]]$freq};
Then I could use the function described in the referenced question/answer:
hotfactor= function(fac,by,n=10,o="other") {
levels(fac)[rank(-xtabs(by~fac))[levels(fac)]>n] <- o
fac
}
To continue the example, if we want only to see the most popular factor we do:
hotfactor(fac$score,fac$occurrence,1);
To get the answer:
[1] 3 other other 3 3 3 other
Levels: 3 other
So my question is, can I do this without having to add a list which counts the occurrences?
Note that I want to do this for the n most popular factors (not just for the most popular factor).
Use table and which.max:
score <- factor(c(3,4,5,3,3,3,5))
levels(score)[- which.max(table(score))] <- "other"
#[1] 3 other other 3 3 3 other
#Levels: 3 other
Obviously this breaks ties by taking the first maximum value.
If you want to keep the top two levels:
score <- factor(c(3, 4,5,3,3,3,5), levels =c(4,3,5))
levels(score)[!levels(score) %in% names(sort(table(score), decreasing = TRUE)[1:2])] <- "other"
#[1] 3 other 5 3 3 3 5
#Levels: other 3 5
If you don't know how many levels you need to group say, 90% of your data and are willing to use dplyr, you could do something along the following lines:
library(dplyr)
df <- data.frame(
f = factor(mapply(rep, letters[1:5], 2^(1:5)) %>% unlist(use.names = F))
)
df %>%
count(f, sort = T) %>%
mutate(p = cumsum(n) / nrow(df))
# A tibble: 5 x 3
# f n p
# <fctr> <int> <dbl>
# 1 e 32 0.5161290
# 2 d 16 0.7741935
# 3 c 8 0.9032258
# 4 b 4 0.9677419
# 5 a 2 1.0000000
(top <- df %>%
count(f, sort = T) %>%
mutate(p = cumsum(n) / nrow(df)) %>%
filter(cumall(p < .91)) %>%
select(f) %>%
unlist(use.names = F))
# [1] e d c
# Levels: a b c d e
levels(df$f) <- factor(c(levels(df$f), 'z'))
df$f[!df$f %in% top] <- 'z'
df %>%
count(f, sort = T) %>%
mutate(p = cumsum(n) / nrow(df))
# A tibble: 4 x 3
# f n p
# <fctr> <int> <dbl>
# 1 e 32 0.5161290
# 2 d 16 0.7741935
# 3 c 8 0.9032258
# 4 z 6 1.0000000

Resources