Related
i am unable to think of an easy method to do this.
Sample data is :
set.seed(101)
b=sample(seq(as.Date("2010/1/1"), as.Date("2010/1/10"), "days"), 3)
f1=data.frame(a=1:length(b), b=b)
col_names=paste(c('x', 'y'), sort(rep(seq(as.Date("2010/1/1"), as.Date("2010/1/10"), "days"), 2)), sep = '')
set.seed((102))
f2 <- data.frame(matrix(sample(0:5,30, replace = T), ncol = length(col_names), nrow = nrow(f1)))
names(f2)=col_names
f3=data.frame(f1, f2)
or
dput(f3)
structure(list(a = 1:3, b = structure(c(14613, 14610, 14615), class = "Date"),
x2010.01.01 = c(3L, 2L, 4L), y2010.01.01 = c(3L, 0L, 2L),
x2010.01.02 = c(5L, 1L, 5L), y2010.01.02 = c(2L, 5L, 4L),
x2010.01.03 = c(4L, 2L, 3L), y2010.01.03 = c(5L, 4L, 2L),
x2010.01.04 = c(5L, 5L, 5L), y2010.01.04 = c(3L, 3L, 3L),
x2010.01.05 = c(1L, 2L, 0L), y2010.01.05 = c(2L, 2L, 2L),
x2010.01.06 = c(3L, 2L, 4L), y2010.01.06 = c(3L, 0L, 2L),
x2010.01.07 = c(5L, 1L, 5L), y2010.01.07 = c(2L, 5L, 4L),
x2010.01.08 = c(4L, 2L, 3L), y2010.01.08 = c(5L, 4L, 2L),
x2010.01.09 = c(5L, 5L, 5L), y2010.01.09 = c(3L, 3L, 3L),
x2010.01.10 = c(1L, 2L, 0L), y2010.01.10 = c(2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-3L))
Im trying to create new columns based on comparing b date to column header. im calculating 1 day avg, 3 day avg and so on.
In 1st case the date is 4th jan which imply that 1 day col would be x2010.01.04, 3 day avg would include (x2010.01.04,x2010.01.03,x2010.01.02) and so on. This need to be done for x and y variable both.
Finally op should look like
a b oneday_avg_x oneday_avg_y threeday_avg_x threeday_avg_y
1 1 2010-01-04 5 3 (5+4+5)/3=4.6 3.3
2 2 2010-01-01 2 0 2 0
3 3 2010-01-06 4 2 3 2.3
Let me know if anything is missing.
We can use apply for "x" and "y" values. We remove first leading "x" and "y" from column names, convert it to Date and match it with b value. Return that index along with mean of previous 3 index from the apply call. As apply converts everything to characters, we use type.convert to convert columns to appropriate classes.
x_cols <- grep("^x", names(f3))
y_cols <- grep("^y", names(f3))
out <- f3[1:2]
out[c("oneday_avg_x", "threeday_avg_x")] <- t(apply(f3[c(2, x_cols)], 1, function(x) {
inds <- match(as.Date(x[[1]]), as.Date(sub("^x", "", names(x)), "%Y.%m.%d"))
c(x[inds], mean(as.numeric(x[(inds - 2) : inds]), na.rm = TRUE))
}))
out[c("oneday_avg_y", "threeday_avg_y")] <- t(apply(f3[c(2, y_cols)], 1, function(x) {
inds <- match(as.Date(x[[1]]), as.Date(sub("^y", "", names(x)), "%Y.%m.%d"))
c(x[inds], mean(as.numeric(x[(inds - 2) : inds]), na.rm = TRUE))
}))
out <- type.convert(out)
out
# a b oneday_avg_x threeday_avg_x oneday_avg_y threeday_avg_y
#1 1 2010-01-04 5 4.6667 3 3.3333
#2 2 2010-01-01 2 2.0000 0 0.0000
#3 3 2010-01-06 4 3.0000 2 2.3333
EDIT
A more scalable solution which can handle multiple day averages by using single apply
x_cols <- grep("^x", names(f3))
y_cols <- grep("^y", names(f3))
names(f3)[-(1:2)] <- gsub("\\.", "-", sub(".{1}", "", names(f3)[-(1:2)]))
out <- f3[1:2]
num <- c(1, 3)
new_cols <- c(outer(num, c("x", "y"), function(x, y) paste0(x, "_day_avg_", y)))
out[new_cols] <- t(apply(f3, 1, function(x) {
x_ind <- match(x[[2]], names(x)[x_cols])
x_vals <- sapply(num, function(y)
mean(as.numeric(x[x_cols][max((x_ind - y + 1), 1):x_ind])))
y_ind <- match(x[[2]], names(x)[y_cols])
y_vals <- sapply(num, function(y)
mean(as.numeric(x[y_cols][max((y_ind - y + 1), 1):y_ind])))
c(x_vals, y_vals)
}))
out
# a b 1_day_avg_x 3_day_avg_x 1_day_avg_y 3_day_avg_y
#1 1 2010-01-04 5 4.666667 3 3.333333
#2 2 2010-01-01 2 2.000000 0 0.000000
#3 3 2010-01-06 4 3.000000 2 2.333333
looking for some help with data manipulation in R. I have data in the following format;
ID L1 L2 L3
1 BBCBCACCBCB CBCBBBB BEBBBAAB
2 BBCBCCCCBCB CBCCCBC BBAACCCB
3 BBCBCACCBCB CBCBBBB BEBBBAAB
4 BBCBCACCBCB CBCBBBB BEBBBAAB
5 BBCBACBCCCB BBCCCBC BBCBAAAAB
6 BBCBBCCBBCB BBCBCEB BBBBCAACB
7 BBCBBCCBBCB BBCBCEB BBBBCAACB
8
9 BBCBCACCBCB CBCBBBB BEBBBAAB
10 BBCBBCCBBCB BBCBCEB BBBBCAACB
11 BBCBBCCBBCB BBCBCEB BBBBCAACB
The values in each column will be strings of varying length. I want an R function that for each column above, will
1) generate a dynamic number of columns based on the maximum length of any string in the column e.g. L1 max length = 11, therefore 11 new columns each labelled L1_1:L1_11
2) then split the strings into triplets, e.g.
ID L1 L2 L3 L1_1 L1_2 L1_3 L1_4 L1_5 L1_6 L1_7 L1_8 L1_9
1 BBCBCACCBCB CBCBBBB BEBBBAAB BBC BCB CBC BCA CAC ACC CCB CBC BCB
3) perform a calculation on this triplet i.e. (number of 'a' * 1) + (number of 'b' * 3) + (number of 'c'*7) in the triplet.
4) return the value of this calculation in the new column.
I have found that the code suggested does exactly what I need when run for columns L1, L2 but does not work for L3. The error I receive is 'Error in as.data.frame.matrix(passed.args[[i]], stringsAsFactors = st : missing value where TRUE/FALSE needed'
Any ideas?
Thanks very much.
EDIT
dput(df):
structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
structure(list(ID = 1:11, L1 = structure(c(4L, 5L, 4L, 4L, 2L, 3L, 3L, 1L, 4L, 3L, 3L), .Label = c("", "BBCBACBCCCB","BBCBBCCBBCB","BBCBCACCBCB", "BBCBCCCCBCB"), class = "factor"), L2 = structure(c(4L, 5L, 4L, 4L, 3L, 2L, 2L, 1L, 4L, 2L, 2L), .Label = c("","BBCBCEB","BBCCCBC", "CBCBBBB", "CBCCCBC"), class = "factor"), L3 = structure(c(5L,2L, 5L, 5L, 4L, 3L, 3L, 1L, 5L, 3L, 3L), .Label = c("", "BBAACCCB", "BBBBCAACB", "BBCBAAAAB", "BEBBBAAB"), class = "factor")), .Names = c("ID", "L1", "L2", "L3"), class = "data.frame", row.names = c(NA,-11L))
#DATA
df = structure(list(ID = 1:4, L1 = c("abbbcc", "aabacd", "abbda",
"bbad")), .Names = c("ID", "L1"), class = "data.frame", row.names = c(NA,
-4L))
#Go through the strings and split into subgroups of 3 characters.
#Put the substrings in a list
temp = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
#Obtain the length of the subgroup with the most triplets
temp_l = max(lengths(temp))
#Subset the subgroups from 1 to temp_l so that remianing values are NA
cbind(df, setNames(data.frame(do.call(rbind, lapply(temp, function(a)
a[1:temp_l]))), nm = paste0("L1_",1:temp_l)))
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc abb bbb bbc bcc
#2 2 aabacd aab aba bac acd
#3 3 abbda abb bbd bda <NA>
#4 4 bbad bba bad <NA> <NA>
If you want calculation based on triplets, run the following before doing the cbind step
temp_L1 = lapply(df$L1, function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0("L1_",1:temp_L1_length))
#REPEAT FOR L2, L3, ...
cbind(df, temp_L1) #Run cbind(df, temp_L1, temp_L2, ...)
# ID L1 L1_1 L1_2 L1_3 L1_4
#1 1 abbbcc 7 9 13 17
#2 2 aabacd 5 5 11 8
#3 3 abbda 7 6 4 NA
#4 4 bbad 7 4 NA NA
UPDATE
You could create a function and use it like shown below
#FUNCTION
foo = function(data, column){
temp_L1 = lapply(as.character(data[[column]]), function(x) sapply(3:nchar(x), function(i) substr(x, i-2, i)))
temp_L1_length = max(lengths(temp_L1))
temp_L1 = lapply(temp_L1, function(x)
sapply(x, function(y){
num_a = unlist(gregexpr(pattern = "a", text = y, ignore.case = TRUE))
num_a = sum(num_a > 0) #length of positive match
num_b = unlist(gregexpr(pattern = "b", text = y, ignore.case = TRUE))
num_b = sum(num_b > 0)
num_c = unlist(gregexpr(pattern = "c", text = y, ignore.case = TRUE))
num_c = sum(num_c > 0)
num_a * 1 + num_b * 3 + num_c * 7
})
)
temp_L1 = setNames(data.frame(do.call(rbind, lapply(temp_L1, function(a)
a[1:temp_L1_length]))), nm = paste0(column,"_",1:temp_L1_length))
return(temp_L1)
}
#USING ON NEW DATA
cbind(df, do.call(cbind, lapply(colnames(df)[-1], function(x) foo(data = df, column = x))))
If you want to use tidyverse verbs
library(tidyverse)
df1 <- df %>%
mutate(L2=L1) %>% # copies L1
nest(L2) %>% # nest L1
mutate(data=map(data,~sapply(1:(nchar(.x)-2), function(y) substr(.x, y, y+2)))) %>% # makes triplets
unnest(data) %>% # unnest triplets
group_by(ID) %>% # perform next operations group wise
mutate(rn=letters[row_number()]) %>% # make future column names
spread(rn,data) # spread long format into wide format (columns)
ID L1 a b c d
1 1 abbbcc abb bbb bbc bcc
2 2 aabacd aab aba bac acd
3 3 abbda abb bbd bda <NA>
4 4 bbad bba bad <NA> <NA>
I have a data with three columns like
Inputdf<-structure(list(df1 = structure(c(4L, 5L, 2L, 1L, 3L), .Label = c("P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0", "Q06323,Q9UL46", "Q92793,Q09472,Q9Y6Q9,Q92831",
"Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"), class = "factor"),
df2 = structure(c(3L, 2L, 5L, 4L, 1L), .Label = c("", "P61158,O15143,O15144",
"Q06323,Q9UL46", "Q6QNY0", "Q92828"), class = "factor"),
df3 = structure(c(5L, 4L, 3L, 2L, 1L), .Label = c("", "O15511",
"Q06323,Q9UL46", "Q6QNY0", "Q92793,Q09472"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -5L))
I am trying to find similar strings in this data for example
in df1, I have the first row I have Q92793,Q09472,Q9Y6Q9,Q92831
then I look at df2 and df3 and see if any of these members are in there then in this example, I make the following data
df1 df2 df3 Numberdf1 df2 df3
1 0 1 4 0 Q92793,Q09472
df1 1 means the first row of df1
df2 0 means it did not have any similarity
df3 1, means the first row of df3 has similarity with df1 row 1
Numberdf1, it is the count of strings separated by a ,which is 4
df2 is 0 because there was not any similar string accords df2
df3 is Q92793,Q09472 which paste the string which were similar in here
a desire output looks like below
out<- structure(list(df1 = 1:5, df2 = c(0L, 3L, 4L, 2L, 1L), df3 = c(1L,
0L, 2L, 4L, 3L), Numberdf1 = c(4L, 6L, 2L, 7L, 2L), df2.1 = structure(c(1L,
5L, 4L, 2L, 3L), .Label = c("0", "P61158,O15143,O15144", "Q06323,Q9UL46",
"Q6QNY0", "Q92828"), class = "factor"), df3.1 = structure(c(5L,
1L, 4L, 2L, 3L), .Label = c("0", "O15511", "Q06323,Q9UL46", "Q6QNY0",
"Q92793,Q09472"), class = "factor")), .Names = c("df1", "df2",
"df3", "Numberdf1", "df2.1", "df3.1"), class = "data.frame", row.names = c(NA,
-5L))
The below function does not work , for example, use this data as input
Inputdf1<- structure(list(df1 = structure(c(2L, 3L, 1L), .Label = c("Q06323,Q9UL46",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7"
), class = "factor"), df2 = structure(1:3, .Label = c("P25788,P25789",
"Q92828, O60907, O75376", "Q9UL46, Q06323"), class = "factor"),
df3 = structure(c(2L, 1L, 3L), .Label = c("Q92831, Q92793, Q09472",
"Q9BZK7, Q92828, O75376, O60907", "Q9UL46, Q06323"), class = "factor")), .Names = c("df1",
"df2", "df3"), class = "data.frame", row.names = c(NA, -3L))
This works for your example:
# First convert factors to strings to lists
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,','))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = Position(not.empty, df2.intersect, nomatch=0)
out[r, 'df3'] = Position(not.empty, df3.intersect, nomatch=0)
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(Find(not.empty, df2.intersect, nomatch=0), collapse=',')
out[r, 'df3.1'] = paste(Find(not.empty, df3.intersect, nomatch=0), collapse=',')
}
out
# df1 df2 df3 Numberdf1 df2.1 df3.1
# 1 1 0 1 4 0 Q92793,Q09472
# 2 2 3 0 6 Q92828 0
# 3 3 4 2 3 Q6QNY0 Q6QNY0
# 4 4 2 4 7 P61158,O15143,O15144 O15511
# 5 5 1 3 2 Q06323,Q9UL46 Q06323,Q9UL46
Note: Find and Position identify the first match only. If there are potentially multiple matches, use which.
EDIT
Version accounting for multiple matches
Inputdf[] = lapply(Inputdf, as.character)
Inputdf[] = lapply(Inputdf, function(col) sapply(col, function(x) unlist(strsplit(x,',\\s*'))))
not.empty = function(x) length(x) > 0
out = data.frame()
for (r in 1:nrow(Inputdf)) {
df2.intersect = lapply(Inputdf$df2, intersect, Inputdf$df1[[r]])
df3.intersect = lapply(Inputdf$df3, intersect, Inputdf$df1[[r]])
out[r, 'df1'] = r
out[r, 'df2'] = paste(which(sapply(df2.intersect, not.empty)), collapse=',')
out[r, 'df3'] = paste(which(sapply(df3.intersect, not.empty)), collapse=',')
out[r, 'Numberdf1'] = length(Inputdf$df1[[r]])
out[r, 'df2.1'] = paste(unique(unlist(df2.intersect)), collapse=',')
out[r, 'df3.1'] = paste(unique(unlist(df3.intersect)), collapse=',')
}
out[out==""] = "0"
Given df as follows:
# group value
# 1 A 8
# 2 A 1
# 3 A 7
# 4 B 3
# 5 B 2
# 6 B 6
# 7 C 4
# 8 C 5
df <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
And a vector of indices (possibly with NA):
inds <- c(2,1,NA)
How we can get the nth element of column value per group, preferably in base R?
For example, based on inds, we want the second element of value in group A, first element in group B, NA in group C. So the result would be:
#[1] 1 3 NA
Here is a solution with mapply and split:
mapply("[", with(df, split(value, group)), inds)
which returns a named vector
A B C
1 3 NA
with(df, split(value, group)) splits the data frame by group and returns a list of data frames. mapply takes that list and "inds" and applies the subsetting function "[" to each pairs of arguments.
Using levels and sapply you could do:
DF <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
inds <- c(2,1,NA)
lvls = levels(DF$group)
groupInds = sapply(1:length(lvls),function(x) DF$value[DF$group==lvls[x]][inds[x]] )
groupInds
#[1] 1 3 NA
Using again mapply (but not nearly as elegant as IMO's answer):
mapply(function(x, y) subset(df, group == x, value)[y,] ,levels(df$group), inds)
I know you said preferably in base R, but just for the record, here is a data.table way
setDT(df)[, .SD[inds[.GRP], value], by=group][,V1]
#[1] 1 3 NA
I just did come up with another solution:
diag(aggregate(value~group, df, function(x) x[inds])[,-1])
#[1] 1 3 NA
Benchmarking
library(microbenchmark)
library(data.table)
df <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L,
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA,
-8L))
inds <- c(2,1,NA)
f_Imo <- function(df) as.vector(mapply("[", with(df, split(value, group)), inds))
f_Osssan <- function(df) {lvls = levels(df$group);sapply(1:length(lvls),function(x) df$value[df$group==lvls[x]][inds[x]])}
f_User2321 <- function(df) unlist(mapply(function(x, y) subset(df, group == x, value)[y,] ,levels(df$group), inds))
f_dww <- function(df) setDT(df)[, .SD[inds[.GRP], value], by=group][,V1]
f_m0h3n <- function(df) diag(aggregate(value~group, df, function(x) x[inds])[,-1])
all.equal(f_Imo(df), f_Osssan(df), f_User2321(df), f_dww(df), f_m0h3n(df))
# [1] TRUE
microbenchmark(f_Imo(df), f_Osssan(df), f_m0h3n(df), f_User2321(df), f_dww(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_Imo(df) 71.004 85.1180 91.52996 91.748 96.8810 121.048 100
# f_Osssan(df) 252.788 276.5265 318.70529 287.648 301.5495 2651.492 100
# f_m0h3n(df) 1422.627 1555.4365 1643.47184 1618.740 1670.7095 4729.827 100
# f_User2321(df) 2889.738 3000.3055 3148.44916 3037.945 3118.7860 6013.442 100
# f_dww(df) 2960.740 3086.2790 3206.02147 3143.381 3250.9545 5976.229 100
I am studying this webpage, and cannot figure out how to rename freq to something else, say number of times imbibed
Here is dput
structure(list(name = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L), .Label = c("Bill", "Llib"), class = "factor"), drink = structure(c(2L,
3L, 1L, 4L, 2L, 3L, 1L, 4L), .Label = c("cocoa", "coffee", "tea",
"water"), class = "factor"), cost = 1:8), .Names = c("name",
"drink", "cost"), row.names = c(NA, -8L), class = "data.frame")
And this is working code with output. Again, I'd like to rename the freq column. Thanks!
library(plyr)
bevs$cost <- as.integer(bevs$cost)
count(bevs, "name")
Output
name freq
1 Bill 4
2 Llib 4
Are you trying to do this?
counts <- count(bevs, "name")
names(counts) <- c("name", "number of times imbibed")
counts
The count() function returns a data.frame. Just rename it like any other data.frame:
counts <- count(bevs, "name")
names(counts)[which(names(counts) == "freq")] <- "number of times imbibed"
print(counts)
# name number of times imbibed
# 1 Bill 4
# 2 Llib 4