Given df as follows:
# group value
# 1 A 8
# 2 A 1
# 3 A 7
# 4 B 3
# 5 B 2
# 6 B 6
# 7 C 4
# 8 C 5
df <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
And a vector of indices (possibly with NA):
inds <- c(2,1,NA)
How we can get the nth element of column value per group, preferably in base R?
For example, based on inds, we want the second element of value in group A, first element in group B, NA in group C. So the result would be:
#[1] 1 3 NA
Here is a solution with mapply and split:
mapply("[", with(df, split(value, group)), inds)
which returns a named vector
A B C
1 3 NA
with(df, split(value, group)) splits the data frame by group and returns a list of data frames. mapply takes that list and "inds" and applies the subsetting function "[" to each pairs of arguments.
Using levels and sapply you could do:
DF <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
inds <- c(2,1,NA)
lvls = levels(DF$group)
groupInds = sapply(1:length(lvls),function(x) DF$value[DF$group==lvls[x]][inds[x]] )
groupInds
#[1] 1 3 NA
Using again mapply (but not nearly as elegant as IMO's answer):
mapply(function(x, y) subset(df, group == x, value)[y,] ,levels(df$group), inds)
I know you said preferably in base R, but just for the record, here is a data.table way
setDT(df)[, .SD[inds[.GRP], value], by=group][,V1]
#[1] 1 3 NA
I just did come up with another solution:
diag(aggregate(value~group, df, function(x) x[inds])[,-1])
#[1] 1 3 NA
Benchmarking
library(microbenchmark)
library(data.table)
df <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
inds <- c(2,1,NA)
f_Imo <- function(df) as.vector(mapply("[", with(df, split(value, group)), inds))
f_Osssan <- function(df) {lvls = levels(df$group);sapply(1:length(lvls),function(x) df$value[df$group==lvls[x]][inds[x]])}
f_User2321 <- function(df) unlist(mapply(function(x, y) subset(df, group == x, value)[y,] ,levels(df$group), inds))
f_dww <- function(df) setDT(df)[, .SD[inds[.GRP], value], by=group][,V1]
f_m0h3n <- function(df) diag(aggregate(value~group, df, function(x) x[inds])[,-1])
all.equal(f_Imo(df), f_Osssan(df), f_User2321(df), f_dww(df), f_m0h3n(df))
# [1] TRUE
microbenchmark(f_Imo(df), f_Osssan(df), f_m0h3n(df), f_User2321(df), f_dww(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_Imo(df) 71.004 85.1180 91.52996 91.748 96.8810 121.048 100
# f_Osssan(df) 252.788 276.5265 318.70529 287.648 301.5495 2651.492 100
# f_m0h3n(df) 1422.627 1555.4365 1643.47184 1618.740 1670.7095 4729.827 100
# f_User2321(df) 2889.738 3000.3055 3148.44916 3037.945 3118.7860 6013.442 100
# f_dww(df) 2960.740 3086.2790 3206.02147 3143.381 3250.9545 5976.229 100
Related
i am unable to think of an easy method to do this.
Sample data is :
set.seed(101)
b=sample(seq(as.Date("2010/1/1"), as.Date("2010/1/10"), "days"), 3)
f1=data.frame(a=1:length(b), b=b)
col_names=paste(c('x', 'y'), sort(rep(seq(as.Date("2010/1/1"), as.Date("2010/1/10"), "days"), 2)), sep = '')
set.seed((102))
f2 <- data.frame(matrix(sample(0:5,30, replace = T), ncol = length(col_names), nrow = nrow(f1)))
names(f2)=col_names
f3=data.frame(f1, f2)
or
dput(f3)
structure(list(a = 1:3, b = structure(c(14613, 14610, 14615), class = "Date"),
x2010.01.01 = c(3L, 2L, 4L), y2010.01.01 = c(3L, 0L, 2L),
x2010.01.02 = c(5L, 1L, 5L), y2010.01.02 = c(2L, 5L, 4L),
x2010.01.03 = c(4L, 2L, 3L), y2010.01.03 = c(5L, 4L, 2L),
x2010.01.04 = c(5L, 5L, 5L), y2010.01.04 = c(3L, 3L, 3L),
x2010.01.05 = c(1L, 2L, 0L), y2010.01.05 = c(2L, 2L, 2L),
x2010.01.06 = c(3L, 2L, 4L), y2010.01.06 = c(3L, 0L, 2L),
x2010.01.07 = c(5L, 1L, 5L), y2010.01.07 = c(2L, 5L, 4L),
x2010.01.08 = c(4L, 2L, 3L), y2010.01.08 = c(5L, 4L, 2L),
x2010.01.09 = c(5L, 5L, 5L), y2010.01.09 = c(3L, 3L, 3L),
x2010.01.10 = c(1L, 2L, 0L), y2010.01.10 = c(2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-3L))
Im trying to create new columns based on comparing b date to column header. im calculating 1 day avg, 3 day avg and so on.
In 1st case the date is 4th jan which imply that 1 day col would be x2010.01.04, 3 day avg would include (x2010.01.04,x2010.01.03,x2010.01.02) and so on. This need to be done for x and y variable both.
Finally op should look like
a b oneday_avg_x oneday_avg_y threeday_avg_x threeday_avg_y
1 1 2010-01-04 5 3 (5+4+5)/3=4.6 3.3
2 2 2010-01-01 2 0 2 0
3 3 2010-01-06 4 2 3 2.3
Let me know if anything is missing.
We can use apply for "x" and "y" values. We remove first leading "x" and "y" from column names, convert it to Date and match it with b value. Return that index along with mean of previous 3 index from the apply call. As apply converts everything to characters, we use type.convert to convert columns to appropriate classes.
x_cols <- grep("^x", names(f3))
y_cols <- grep("^y", names(f3))
out <- f3[1:2]
out[c("oneday_avg_x", "threeday_avg_x")] <- t(apply(f3[c(2, x_cols)], 1, function(x) {
inds <- match(as.Date(x[[1]]), as.Date(sub("^x", "", names(x)), "%Y.%m.%d"))
c(x[inds], mean(as.numeric(x[(inds - 2) : inds]), na.rm = TRUE))
}))
out[c("oneday_avg_y", "threeday_avg_y")] <- t(apply(f3[c(2, y_cols)], 1, function(x) {
inds <- match(as.Date(x[[1]]), as.Date(sub("^y", "", names(x)), "%Y.%m.%d"))
c(x[inds], mean(as.numeric(x[(inds - 2) : inds]), na.rm = TRUE))
}))
out <- type.convert(out)
out
# a b oneday_avg_x threeday_avg_x oneday_avg_y threeday_avg_y
#1 1 2010-01-04 5 4.6667 3 3.3333
#2 2 2010-01-01 2 2.0000 0 0.0000
#3 3 2010-01-06 4 3.0000 2 2.3333
EDIT
A more scalable solution which can handle multiple day averages by using single apply
x_cols <- grep("^x", names(f3))
y_cols <- grep("^y", names(f3))
names(f3)[-(1:2)] <- gsub("\\.", "-", sub(".{1}", "", names(f3)[-(1:2)]))
out <- f3[1:2]
num <- c(1, 3)
new_cols <- c(outer(num, c("x", "y"), function(x, y) paste0(x, "_day_avg_", y)))
out[new_cols] <- t(apply(f3, 1, function(x) {
x_ind <- match(x[[2]], names(x)[x_cols])
x_vals <- sapply(num, function(y)
mean(as.numeric(x[x_cols][max((x_ind - y + 1), 1):x_ind])))
y_ind <- match(x[[2]], names(x)[y_cols])
y_vals <- sapply(num, function(y)
mean(as.numeric(x[y_cols][max((y_ind - y + 1), 1):y_ind])))
c(x_vals, y_vals)
}))
out
# a b 1_day_avg_x 3_day_avg_x 1_day_avg_y 3_day_avg_y
#1 1 2010-01-04 5 4.666667 3 3.333333
#2 2 2010-01-01 2 2.000000 0 0.000000
#3 3 2010-01-06 4 3.000000 2 2.333333
I have 2 data frames with multiple factor columns. One is the base data frame and the other is the final data frame. I want to update the levels of the base data frame using the final data frame.
Consider this example:
base <- data.frame(product=c("Business Call", "Business Transactional",
"Monthly Non-Compounding and Standard Non-Compounding",
"OCR based Call", "Offsale Call", "Offsale Savings",
"Offsale Transactional", "Out of Scope","Personal Call"))
base$product <- as.factor(base$product)
final <- data.frame(product=c("Business Call", "Business Transactional",
"Monthly Standard Non-Compounding", "OCR based Call",
"Offsale Call", "Offsale Savings","Offsale Transactional",
"Out of Scope","Personal Call", "You Money"))
final$product <- as.factor(final$product)
What I would now want is for the final data base to have the same levels as base and remove the levels which do not exist at all like "You Money". Whereas "Monthly Standard Non-Compounding" to be fuzzy matched
Eg:
levels(base$var1) <- "a" "b" "c"
levels(final$var1) <- "Aa" "Bb" "Cc"
Is there a way to overwrite the levels in base data using the final data using some kind of fuzzy match?
Like I want the final levels for both data to be the same. i.e.
levels(base$var1) <- "Aa" "Bb" "Cc"
levels(final$var1) <- "Aa" "Bb" "Cc"
We could build our own fuzzyMatcher.
First, we'll need kinda vectorized agrep function,
agrepv <- function(x, y) all(as.logical(sapply(x, agrep, y)))
on which we build our fuzzyMatcher.
fuzzyMatcher <- function(from, to) {
mc <- mapply(function(y)
which(mapply(function(x) agrepv(y, x), Map(levels, to))),
Map(levels, from))
return(Map(function(x, y) `levels<-`(x, y), base,
Map(levels, from)[mc]))
}
final labels applied on base labels (note, that I've shifted columns to make it a little more sophisticated):
base[] <- fuzzyMatcher(final1, base1)
# X1 X2
# 1 Aa Xx
# 2 Aa Xx
# 3 Aa Yy
# 4 Aa Yy
# 5 Bb Yy
# 6 Bb Zz
# 7 Bb Zz
# 8 Aa Xx
# 9 Cc Xx
# 10 Cc Zz
Update
Based on the new provided data above it'll make sense to use another vectorized agrepv2(), which, used with outer(), enables us to apply agrep on all combinations of the levels of both vectors. Hereafter colSums that equal zero give us non-matching levels and which.max the matching levels of the target data frame final. We can use these two resulting vectors on the one hand to delete unused rows of final, on the other hand to subset the desired levels of the base data frame in order to rebuild the factor column.
# add to mimic other columns in data frame
base$x <- seq(nrow(base))
final$x <- seq(nrow(final))
# some abbrevations for convenience
p1 <- levels(base$product)
p2 <- levels(final$product)
# agrep
AGREPV2 <- Vectorize(function(x, y, ...) agrep(p2[x], p1[y])) # new vectorized agrep
out <- t(outer(seq(p2), seq(p1), agrepv2, max.distance=0.9)) # apply `agrepv2`
del.col <- grep(0, colSums(apply(out, 2, lengths))) # find negative matches
lvl <- unlist(apply(out, 2, which.max)) # find positive matches
lvl <- as.character(p2[lvl]) # get the labels
# delete "non-existing" rows and re-generate factor with new labels
transform(final[-del.col, ], product=factor(product, labels=lvl))
# product x
# 1 Business Call 1
# 2 Business Transactional 2
# 4 OCR based Call 4
# 5 Offsale Call 5
# 6 Offsale Savings 6
# 7 Offsale Transactional 7
# 8 Out of Scope 8
# 9 Personal Call 9
Data
base1 <- structure(list(X1 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L,
3L, 3L), .Label = c("a", "b", "c"), class = "factor"), X2 = structure(c(1L,
1L, 2L, 2L, 2L, 3L, 3L, 1L, 1L, 3L), .Label = c("x", "y", "z"
), class = "factor")), row.names = c(NA, -10L), class = "data.frame")
final1 <- structure(list(X1 = structure(c(1L, 3L, 1L, 1L, 2L, 3L, 2L, 1L,
2L, 2L, 3L, 3L, 2L, 2L, 2L), .Label = c("Xx", "Yy", "Zz"), class = "factor"),
X2 = structure(c(2L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 3L), .Label = c("Aa", "Bb", "Cc"), class = "factor")), row.names = c(NA,
-15L), class = "data.frame")
looking for some help with data manipulation in R. I have data in the following format;
ID L1 L2 L3
1 BBCBCACCBCB CBCBBBB BEBBBAAB
2 BBCBCCCCBCB CBCCCBC BBAACCCB
3 BBCBCACCBCB CBCBBBB BEBBBAAB
4 BBCBCACCBCB CBCBBBB BEBBBAAB
5 BBCBACBCCCB BBCCCBC BBCBAAAAB
6 BBCBBCCBBCB BBCBCEB BBBBCAACB
7 BBCBBCCBBCB BBCBCEB BBBBCAACB
8
9 BBCBCACCBCB CBCBBBB BEBBBAAB
10 BBCBBCCBBCB BBCBCEB BBBBCAACB
11 BBCBBCCBBCB BBCBCEB BBBBCAACB
The values in each column will be strings of varying length. I want an R function that for each column above, will
1) generate a dynamic number of columns based on the maximum length of any string in the column e.g. L1 max length = 11, therefore 11 new columns each labelled L1_1:L1_11
2) then split the strings into triplets, e.g.
ID L1 L2 L3 L1_1 L1_2 L1_3 L1_4 L1_5 L1_6 L1_7 L1_8 L1_9
1 BBCBCACCBCB CBCBBBB BEBBBAAB BBC BCB CBC BCA CAC ACC CCB CBC BCB
3) perform a calculation on this triplet i.e. (number of 'a' * 1) + (number of 'b' * 3) + (number of 'c'*7) in the triplet.
4) return the value of this calculation in the new column.
I have found that the code suggested does exactly what I need when run for columns L1, L2 but does not work for L3. The error I receive is 'Error in as.data.frame.matrix(passed.args[[i]], stringsAsFactors = st : missing value where TRUE/FALSE needed'
Any ideas?
Thanks very much.
EDIT
dput(df):
structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
#DATA
df = structure(list(ID = 1:4, L1 = c("abbbcc", "aabacd", "abbda",
"bbad")), .Names = c("ID", "L1"), class = "data.frame", row.names = c(NA,
-4L))
#Go through the strings and split into subgroups of 3 characters.
#Put the substrings in a list
temp = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
#Obtain the length of the subgroup with the most triplets
temp_l = max(lengths(temp))
#Subset the subgroups from 1 to temp_l so that remianing values are NA
cbind(df, setNames(data.frame(do.call(rbind, lapply(temp, function(a)
a[1:temp_l]))), nm = paste0("L1_",1:temp_l)))
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc abb bbb bbc bcc
#2 2 aabacd aab aba bac acd
#3 3 abbda abb bbd bda <NA>
#4 4 bbad bba bad <NA> <NA>
If you want calculation based on triplets, run the following before doing the cbind step
temp_L1 = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0("L1_",1:temp_L1_length))
#REPEAT FOR L2, L3, ...
cbind(df, temp_L1) #Run cbind(df, temp_L1, temp_L2, ...)
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc 7 9 13 17
#2 2 aabacd 5 5 11 8
#3 3 abbda 7 6 4 NA
#4 4 bbad 7 4 NA NA
UPDATE
You could create a function and use it like shown below
#FUNCTION
foo = function(data, column){
temp_L1 = lapply(as.character(data[[column]]), function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y, ignore.case = TRUE))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y, ignore.case = TRUE))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y, ignore.case = TRUE))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0(column,"_",1:temp_L1_length))
return(temp_L1)
}
#USING ON NEW DATA
cbind(df, do.call(cbind, lapply(colnames(df)[-1], function(x) foo(data = df, column = x))))
If you want to use tidyverse verbs
library(tidyverse)
df1 <- df %>%
mutate(L2=L1) %>% # copies L1
nest(L2) %>% # nest L1
mutate(data=map(data,~sapply(1:(nchar(.x)-2), function(y) substr(.x, y, y+2)))) %>% # makes triplets
unnest(data) %>% # unnest triplets
group_by(ID) %>% # perform next operations group wise
mutate(rn=letters[row_number()]) %>% # make future column names
spread(rn,data) # spread long format into wide format (columns)
ID L1 a b c d
1 1 abbbcc abb bbb bbc bcc
2 2 aabacd aab aba bac acd
3 3 abbda abb bbd bda <NA>
4 4 bbad bba bad <NA> <NA>
This question already has answers here:
Return most frequent string value for each group [duplicate]
(3 answers)
Fastest way of determining most frequent factor in a grouped data frame in dplyr
(4 answers)
Closed 5 years ago.
I have a dataset as below:
Group Class
A 1
A 2
A 1
A 1
B 2
B 2
B 2
B 1
B 3
B 1
C 1
C 1
C 1
C 2
C 3
I want to aggregate the table by the ‘Group’ column and the value on the ‘Class’ column would be the Class with maximum count. For instance, for Group A, 1 appears three times, so the value for Class is 1. Similarly, for Group 2, 2 appears three times, so the value for Class is 2. The result table should be the following:
Group Class
A 1
B 2
C 1
I am new to R programming and would appreciate your help in solving this problem. Thanks!
You could also do this without using aggregate, so using table and max.col instead:
tb <- table(df$Group, df$Class)
data.frame("Group"=rownames(tb), "CLass"=max.col(tb))
# Group CLass
#1 A 1
#2 B 2
#3 C 1
Which seems to be faster:
library(microbenchmark)
# by Ronak Shah in comments
f1 <- function(df) aggregate(Class~Group, df, function(x) which.max(table(x)))
# this answer
f2 <- function(df) {tb <- table(df$Group, df$Class);
data.frame("Group"=rownames(tb), "CLass"=max.col(tb));}
all(f1(df)==f2(df))
# [1] TRUE
microbenchmark(f1(df), f2(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# f1(df) 800.153 838.9130 923.6484 870.0115 918.988 1981.901 100
# f2(df) 298.367 319.0995 353.4915 338.6305 380.246 599.439 100
df <- structure(list(Group = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
Class = c(1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 3L, 1L, 1L, 1L,
1L, 2L, 3L)), .Names = c("Group", "Class"), class = "data.frame", row.names = c(NA,
-15L))
I'm trying to get the data from column one that matches with column 2 but only on the "B" values. Need to somehow make the true values a list.
Need this to repeat for 50,000 rows. Around 37,000 of them are true.
I'm incredibly new to this so any help would be nice.
Data <- data.frame(
X = sample(1:10),
Y = sample(c("B", "W"), 10, replace = TRUE)
)
Count <- 1
If(data[count,2] == "B") {
List <- list(data[count,1]
Count <- count + 1
#I'm not sure what to use to repeat I just put
Repeat
} else {
Count <- count + 1
Repeat
}
End result should be a list() of only column one data.
In this if rows 1-5 had "B" I want the column one numbers from that.
Not sure if I understood correctly what you're looking for, but from the comments I would assume that this might help:
setNames(data.frame(Data[1][Data[2]=="B"]), "selected")
# selected
#1 2
#2 5
#3 7
#4 6
No loop needed.
data
Data <- structure(list(X = c(10L, 4L, 9L, 8L, 3L, 2L, 5L, 1L, 7L, 6L),
Y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L),
.Label = c("B", "W"), class = "factor")),
.Names = c("X", "Y"), row.names = c(NA, -10L),
class = "data.frame")