If else ladder not working in R - r

I have this in my dataframe after reading and rearranging multiple csv files. Basically I want an if else ladder to refer to the ID column and if it matches a number from the list of concatenates then place a word in a new "group" column
# of int. int. not.int. ID
1 50 218.41 372.16 1
3 33 134.94 158.17 3
I then made these concatenates to refer to.
veh = as.character(c('1', '5'))
thc1 = as.character(c('2', '6'))
thc2 = as.character(c('3', '7'))
thc3 = as.character(c('4', '8'))
Then I made an if else ladder to list the corresponding values.
social.dat$group = if (social.dat$ID == veh) {
social.dat$group == "veh"
} else if (social.dat$group == thc1) {
social.dat$group == "thc1"
} else if (social.dat$group == thc2) {
social.dat$group == "thc2"
} else {
social.dat$group == "thc3"
}
However, I then get this warning message.
Warning message:
In if (social.dat$ID == veh) { :
the condition has length > 1 and only the first element will be used
I have looked up this warning message in multiple different variations and have not found anything that really helped. Any help for this would be much appreciated or and alternate options would be good as well. I apologize in advance if there was a solution on stack already if I missed it.
EDIT:
I tried using
social.dat$group = ifelse(social.dat$ID == veh, "veh", "thc")
social.dat$group = ifelse(social.dat$ID == thc, "thc", "veh")
but it changed the output of the dataframe after each line.
Here is the full code i am using to rearrange the csv files and get the dataframe that I first mentioned above.
#calls packages
library(tidyr)
library( plyr )
library(reshape2)
#make sure to change your working directory to where the files are kept
setwd("C:/Users/callej03/Desktop/test")
wd = "C:/Users/callej03/Desktop/test"
files = list.files(path=wd, pattern="*.csv", full.names=TRUE,
recursive=FALSE)
################################################################
#this function creates a list of the number of interactions for each file in
the folder
lap.list = lapply(files, function(x) {
dat = read.csv(x, header= TRUE)
dat = dat[-c(1),]
dat = as.data.frame(dat)
dat = separate(data = dat, col = dat, into = c("lap", "duration"), sep = "\\
")
dat$count = 1:nrow(dat)
y = dat$count
i= y%%2==0
dat$interacting = i
int = dat[which(dat$interacting == TRUE),]
interactions = sum(int$interacting)
})
#########################################################################
#this changes the row name to the name of the file - i.e. the rat ID
lap.list = as.data.frame(lap.list)
lap.list = t(lap.list)
colnames(lap.list) = c("# of int.")
row.names(lap.list) = sub(wd, "", files)
row.names(lap.list) = gsub("([0-9]+).*$", "\\1", rownames(lap.list))
row.names(lap.list) = gsub('/', "", row.names(lap.list), fixed = TRUE)
###########################################################################
#this applies almost the same function as the one listed above except I call
it a different vector name so it can be manipulated
int.duration = lapply(files, function(x) {
dat2 = read.csv(x, header= TRUE)
dat2 = dat2[-c(1),]
dat2 = as.data.frame(dat2)
dat2 = separate(data = dat2, col = dat2, into = c("lap", "duration"), sep =
"\\ ")
dat2$count = 1:nrow(dat2)
y = dat2$count
i= y%%2==0
dat2$interacting = i
int = dat2[which(dat2$interacting == TRUE),]
})
noint.duration = lapply(files, function(x) {
dat2 = read.csv(x, header= TRUE)
dat2 = dat2[-c(1),]
dat2 = as.data.frame(dat2)
dat2 = separate(data = dat2, col = dat2, into = c("lap", "duration"), sep =
"\\ ")
dat2$count = 1:nrow(dat2)
y = dat2$count
i= y%%2==0
dat2$interacting = i
noint = dat2[which(dat2$interacting == FALSE),]
})
###################################################################
#this splits the output time of minutes, seconds, and milliseconds.
#then it combines them into a total seconds.milliseconds readout.
#after that, it takes the sum of the times for each file and combines them
with the total interactions dataframe.
int.duration = melt(int.duration)
int.duration = as.data.frame(int.duration)
int.left = as.data.frame(substr(int.duration$duration, 1, 2))
colnames(int.left) = "min"
int.mid = as.data.frame(substr(int.duration$duration, 4, 4 + 2 - 1))
colnames(int.mid) = "sec"
int.right = as.data.frame(substr(int.duration$duration,
nchar(int.duration$duration) - (2-1), nchar(int.duration$duration)))
colnames(int.right) = "ms"
int.time = cbind(int.left, int.mid, int.right)
int.time$min = as.numeric(as.character(int.time$min))
int.time$sec = as.numeric(as.character(int.time$sec))
int.time$ms = as.numeric(as.character(int.time$ms))
int.time$ms = int.time$ms/100
int.time$min = ifelse(int.time$min > 0, int.time$min*60,0)
int.time$sum = rowSums(int.time)
int.file = as.data.frame(int.duration$L1)
colnames(int.file) = "file"
int.time = cbind(int.time, int.file)
int.tot = as.data.frame(tapply(int.time$sum, int.time$file, sum))
colnames(int.tot) = "int."
social.dat = cbind(lap.list, int.tot)
noint.duration = melt(noint.duration)
noint.duration = as.data.frame(noint.duration)
noint.left = as.data.frame(substr(noint.duration$duration, 1, 2))
colnames(noint.left) = "min"
noint.mid = as.data.frame(substr(noint.duration$duration, 4, 4 + 2 - 1))
colnames(noint.mid) = "sec"
noint.right = as.data.frame(substr(noint.duration$duration,
nchar(noint.duration$duration) - (2-1), nchar(noint.duration$duration)))
colnames(noint.right) = "ms"
noint.time = cbind(noint.left, noint.mid, noint.right)
noint.time$min = as.numeric(as.character(noint.time$min))
noint.time$sec = as.numeric(as.character(noint.time$sec))
noint.time$ms = as.numeric(as.character(noint.time$ms))
noint.time$ms = noint.time$ms/100
noint.time$min = ifelse(noint.time$min > 0, noint.time$min*60,0)
noint.time$sum = rowSums(noint.time)
noint.file = as.data.frame(noint.duration$L1)
colnames(noint.file) = "file"
noint.time = cbind(noint.time, noint.file)
noint.tot = as.data.frame(tapply(noint.time$sum, noint.time$file, sum))
colnames(noint.tot) = "not.int."
social.dat = cbind(social.dat, noint.tot)
social.dat$ID = rownames(social.dat)
Here is and axample of a csv file I am working with. The words are all in the same column and separated by spaces.
Total time 10:00.61
Lap times
01 00:07.46
02 00:05.64
03 00:01.07
04 00:01.04
05 00:04.71
06 00:06.43
07 00:12.52
08 00:07.34
09 00:05.46
10 00:05.81
11 00:05.52
12 00:06.51
13 00:10.75
14 00:00.83
15 00:03.64
16 00:02.75
17 00:01.20
18 00:06.17
19 00:04.40
20 00:00.75
21 00:00.84
22 00:01.29
23 00:02.31
24 00:03.04
25 00:02.85
26 00:05.86
27 00:05.76
28 00:05.06
29 00:00.96
30 00:06.91

#akrun suggested ifelse, which works great for one or two nestings. Much past that, and my personal preference is to use dplyr::case_when or a separate data.frame in a merge/join of sorts.
If you are using the "simple case" of assigning consistently by the same fields (id in this case), then the merge/join is my preferred method: it makes maintenance much simpler (IMO). (When I say "consistently by the same fields", I mean that you could have a id1 and id2 fields by which you define the individual records and their applicable groups. Likely too much for your example, so I'll keep this answer at one key merging.)
Three methods (data far below):
Base R
dat2a <- merge(dat, groups, by="id", all.x=TRUE)
dat2a
# id int group
# 1 1 22 veh
# 2 2 33 thc1
# 3 3 44 <NA>
Note that any id not included in the definition of groups will have NA group. You can assign a default group with this:
dat2a$group[is.na(dat2a$group)] <- "somedefaultgroup"
dat2a
# id int group
# 1 1 22 veh
# 2 2 33 thc1
# 3 3 44 somedefaultgroup
dplyr, merge/join
Similar concept, but using dplyr-esque verbs.
library(dplyr)
dat2c <- left_join(dat, groups, by="id") %>%
mutate(group = if_else(is.na(group), "somedefaultgroup", group))
dplyr::case_when
(This does not use groups as I defined for the merge/join cases.)
In case you really want to do some ladder/nesting of if/else-like statements, case_when is easier to read (and debug) and might be faster, depending on your use-case.
Most direct:
library(dplyr)
dat2b <- dat
dat2b$group <- case_when(
dat2b$id %in% c("1","5") ~ "veh",
dat2b$id %in% c("2","6") ~ "thc1",
TRUE ~ "somedefaultgroup"
)
A little easier to read than the previous by using with(...), but functionally identical. (If your "ladder" is much longer, then code-golf (number of characters in the code) can be significantly reduced.)
dat2b <- dat
dat2b$group <- with(dat2b, case_when(
id %in% c("1","5") ~ "veh",
id %in% c("2","6") ~ "thc1",
TRUE ~ "somedefaultgroup"
))
If you want to use some dplyr verbs, then:
dat2b <- dat
dat2b <- dat2b %>%
mutate(
group = case_when(
id %in% c("1","5") ~ "veh",
id %in% c("2","6") ~ "thc1",
TRUE ~ "somedefaultgroup"
)
)
Data
When doing merge/join actions, it's important to use stringsAsFactors=FALSE so that the absence of factor levels (of the newly-assigned groups) is not a problem. (This can be worked around, but ...)
dat <- data.frame(id=c("1","2","3"), int=c(22L,33L,44L),
stringsAsFactors=FALSE)
Optional use for the merge examples above:
groups <- data.frame(id=c("1","5","2","6"), group=c("veh","veh","thc1","thc1"),
stringsAsFactors=FALSE)
groups
# id group
# 1 1 veh
# 2 5 veh
# 3 2 thc1
# 4 6 thc1
The premise is that you define one row for each unique id.

Thanks to #r2evans the following code worked exactly as I wanted it to (using dplyr::case_when)
social.dat$group = case_when(
social.dat$ID %in% c("1","5") ~ "veh",
social.dat$ID %in% c("2","6") ~ "thc1",
social.dat$ID %in% c("3","7") ~ "thc2",
social.dat$ID %in% c("4","8") ~ "thc3"
)
This was the final output of the dataframe
# of int. int. not.int. ID group
1 50 218.41 372.16 1 veh
3 33 134.94 158.17 3 thc2

Related

Separating multi-valued attributes into individual attributes R

I'm working with the stackoverflow developer survey data-set and attempting to predict compensation based on technologies worked with and collaborative tools worked with. These two attributes are multi-valued with semicolons separating the individual values.
For instance, under the CollabToolsWorkedWith attribute in one row, there is Confluence;Jira;Github;Slack;Microsoft;Teams;Google Suite. I want to give each of these values their own column with a value of either 0 or 1 if the row had that value.
The end result would have each row contain a column for every single value under CollabToolsWorkedWith and each column would contain 0's and 1's based on whether or not the row contained that value.
You may get a quicker answer next time if you provide some sample data that everyone can quickly access. I found the 2020 data online. Here is my answer:
# read the data frame
rm(list = ls())
df <- read.csv("survey_results_public.csv")
# figure out which column you are talking about
data.frame(colnames(df))
table(df$NEWCollabToolsWorkedWith)
# convert to lower case and character
df$NEWCollabToolsWorkedWith <- as.character(df$NEWCollabToolsWorkedWith)
df$NEWCollabToolsWorkedWith <- tolower(df$NEWCollabToolsWorkedWith)
# keep only the useful variables and separate based on ;
library(tidyverse)
library(splitstackshape)
namesdf <- df %>% select(NEWCollabToolsWorkedWith)
namesdf <- cSplit(namesdf,"NEWCollabToolsWorkedWith", sep = ";", direction = "wide", drop=TRUE,
type.convert = TRUE)
# stack stuff on top of each other to find unique list of tools/platforms
long_data_frame <-
namesdf %>%
pivot_longer(cols = starts_with("NEWCollabToolsWorkedWith"), # use columns starting with "year"
names_to ="unique", # name of new column
names_prefix = "_",
values_drop_na = TRUE) %>%
distinct(value)
# clean the variable names
library(janitor)
long_data_frame$value = as.character(long_data_frame$value)
long_data_frame$value = janitor::make_clean_names(long_data_frame$value)
# get final unique list
table(long_data_frame$value)
> table(long_data_frame$value)
confluence facebook_workplace github gitlab
1 1 1 1
google_suite_docs_meet_etc jira microsoft_azure microsoft_teams
1 1 1 1
slack stack_overflow_for_teams trello
1 1 1
# create new variables
df$confluence <- NA
df$jira <- NA
df$slack = NA
df$microsoft_azure =NA
df$trello = NA
df$github = NA
df$gitlab = NA
df$google_suite_docs_meet_etc = NA
df$microsoft_teams = NA
df$stack_overflow_for_teams = NA
df$facebook_workplace =NA
# make a dummy variable based on string match
df$confluence <- as.integer(grepl(pattern = "confluence", x = df$NEWCollabToolsWorkedWith))
df$jira <- as.integer(grepl(pattern = "jira", x = df$NEWCollabToolsWorkedWith))
df$slack <- as.integer(grepl(pattern = "slack", x = df$NEWCollabToolsWorkedWith))
df$microsoft_azure <- as.integer(grepl(pattern = "microsoft azure", x = df$NEWCollabToolsWorkedWith))
df$trello <- as.integer(grepl(pattern = "trello", x = df$NEWCollabToolsWorkedWith))
df$github <- as.integer(grepl(pattern = "github", x = df$NEWCollabToolsWorkedWith))
df$gitlab <- as.integer(grepl(pattern = "gitlab", x = df$NEWCollabToolsWorkedWith))
df$google_suite_docs_meet_etc <- as.integer(grepl(pattern = "google", x = df$NEWCollabToolsWorkedWith))
df$microsoft_teams <- as.integer(grepl(pattern = "microsoft teams", x = df$NEWCollabToolsWorkedWith))
df$stack_overflow_for_teams <- as.integer(grepl(pattern = "overflow", x = df$NEWCollabToolsWorkedWith))
df$facebook_workplace <- as.integer(grepl(pattern = "facebook", x = df$NEWCollabToolsWorkedWith))
# proof that it went through
table(df$facebook_workplace)
> table(df$facebook_workplace)
0 1
62881 1580

How does the table and $freq function work in R

I want a function for the mode of a vector. Abhiroop Sarkar's answer to This question works, but I want to understand why.
Here is the code
Mode <- function(x){
y <- data.frame(table(x))
y[y$Freq == max(y$Freq),1]
}
1) Wy do we need to put the table in a data frame,
2) in this line
y[y$Freq == max(y$Freq),1]
what does the y$Freq do? is frequency a default columns in the table?
When we convert a table output to data.frame, it creates a two column data.frame
set.seed(24)
v1 <- table(sample(1:5, 100, replace = TRUE))
y <- data.frame(v1)
y
# Var1 Freq
#1 1 19
#2 2 24
#3 3 22
#4 4 16
#5 5 19
The first column 'Var1' is the names of the frequency output from table and the 'Freq' is the actual frequency of those names
y[y$Freq == max(y$Freq), 1]
#[1] 2
#Levels: 1 2 3 4 5
Now, we are subsetting the first column 'Var1' based on the max value of 'Freq', and it returns a vector because of the drop = TRUE in [ when there is a single column
If we want to return a data.frame with single, add drop = FALSE at the end
y[y$Freq == max(y$Freq), 1, drop = FALSE]
# Var1
#2 2
Regarding the default name Freq, it is created from the as.data.frame.table method
as.data.frame.table
function (x, row.names = NULL, ..., responseName = "Freq", stringsAsFactors = TRUE,
sep = "", base = list(LETTERS))
{
ex <- quote(data.frame(do.call("expand.grid", c(dimnames(provideDimnames(x,
sep = sep, base = base)), KEEP.OUT.ATTRS = FALSE, stringsAsFactors = stringsAsFactors)),
Freq = c(x), row.names = row.names))
names(ex)[3L] <- responseName
eval(ex)
}

For loop for updating data.frame

I am trying to write a "for loop" to update my R data frame by iterating.
Here is my code:
datalist = list()
for (i in 1:5) {
dat <- data.frame(ID=LETTERS[seq( from = 1, to = 20 )],nutrition=rnorm(20, mean=50, sd=10),
Stage=c(rep("A1",5), rep("B1",15)))
dat$ADG<-dat$nutrition*0.05
dat$M_weight<-dat$nutrition*0.5+dat$ADG*100
dat$Age<-dat$M_weight*1.1+dat$ADG*0.6
dat$Stage<-as.character(dat$Stage)
dat$Stage[dat$ADG>=3]<-"C1"
dat$i <- i # maybe you want to keep track of which iteration produced it?
datalist[[i]] <- dat # add it to your list #
}
big_data = do.call(rbind, datalist)
From iteration 2, I would like to have "Stage" updated to "C1" if ADG is equal or greater than 3 but this would not apply to iteration 1.
Thank you so much! I appreciate any replies!
I think you want a recursive function instead of an iterative one
Your data stringsAsFactors=F
dat <- data.frame(ID=LETTERS[seq( from = 1, to = 20 )], nutrition=rnorm(20, mean=50, sd=10), Stage=c(rep("A1",5), rep("B1",15)), stringsAsFactors=F)
Use tidyverse for dplyr and purrr verbs
library(tidyverse)
special <- function( dat, counter, end ) {
dat1 <- dat %>%
mutate(ADG = nutrition*0.05) %>%
mutate(M_weight = nutrition*0.5 + ADG*100) %>%
mutate(Age = M_weight*1.1 + ADG*0.6) %>%
mutate(Stage = ifelse( ADG >= 3, "C1", Stage )) %>%
mutate(i=counter)
if (counter < end) {
special(dat1, counter+1, end)
} else {
return(dat1)
}
}
desired <- map_df(2:5, ~special(dat,1,.x))
head(desired)
ID nutrition Stage ADG M_weight Age i
1 A 47.17826 A1 2.358913 259.4804 286.8438 2
2 B 64.55988 C1 3.227994 355.0794 392.5241 2
3 C 52.29020 A1 2.614510 287.5961 317.9244 2
4 D 59.96544 A1 2.998272 329.8099 364.5899 2
Let me know if this is not the output you were expecting

Joint Occurrence of variables in R

I want to count individual and combine occurrence of variables (1 represents presence and 0 represents absence). This can be obtained by multiple uses of table function (See MWE below). Is it possible to use a more efficient approach to get the required output given below?
set.seed(12345)
A <- rbinom(n = 100, size = 1, prob = 0.5)
B <- rbinom(n = 100, size = 1, prob = 0.6)
C <- rbinom(n = 100, size = 1, prob = 0.7)
df <- data.frame(A, B, C)
table(A)
A
0 1
48 52
table(B)
B
0 1
53 47
table(C)
C
0 1
34 66
table(A, B)
B
A 0 1
0 25 23
1 28 24
table(A, C)
C
A 0 1
0 12 36
1 22 30
table(B, C)
C
B 0 1
0 21 32
1 13 34
table(A, B, C)
, , C = 0
B
A 0 1
0 8 4
1 13 9
, , C = 1
B
A 0 1
0 17 19
1 15 15
Required Output
I am requiring something like the following:
A = 52
B = 45
C = 66
A + B = 24
A + C = 30
B + C = 34
A + B + C = 15
Expanding on Sumedh's answer, you can also do this dynamically without having to specify the filter every time. This will be useful if you have more than only 3 columns to combine.
You can do something like this:
lapply(seq_len(ncol(df)), function(i){
# Generate all the combinations of i element on all columns
tmp_i = utils::combn(names(df), i)
# In the columns of tmp_i we have the elements in the combination
apply(tmp_i, 2, function(x){
dynamic_formula = as.formula(paste("~", paste(x, "== 1", collapse = " & ")))
df %>%
filter_(.dots = dynamic_formula) %>%
summarize(Count = n()) %>%
mutate(type = paste0(sort(x), collapse = ""))
}) %>%
bind_rows()
}) %>%
bind_rows()
This will:
1) generate all the combinations of the columns of df. First the combinations with one element (A, B, C) then the ones with two elements (AB, AC, BC), etc.
This is the external lapply
2) then for every combination will create a dynamic formula. For AB for instance the formula will be A==1 & B==1, exactly as Sumedh suggested. This is the dynamic_formula bit.
3) Will filter the dataframe with the dynamically generated formula and count the number of rows
4) Bind all together (the two bind_rows)
The output will be
Count type
1 52 A
2 47 B
3 66 C
4 24 AB
5 30 AC
6 34 BC
7 15 ABC
EDITED TO ADD: I see now that you don't want to get the exclusive counts (i.e. A and AB should both include all As).
I got more than a little nerd-sniped by this today, particularly as I wanted to solve it using base R with no packages. The below should do that.
There is a very easy (in principle) solution that simply uses xtabs(), which I've illustrated below. However, to generalize it for any potential number of dimensions, and then to apply it to a variety of combinations, actually was harder. I strove to avoid using the dreaded eval(parse()).
set.seed(12345)
A <- rbinom(n = 100, size = 1, prob = 0.5)
B <- rbinom(n = 100, size = 1, prob = 0.6)
C <- rbinom(n = 100, size = 1, prob = 0.7)
df <- data.frame(A, B, C)
# Turn strings off
options(stringsAsFactors = FALSE)
# Obtain the n-way frequency table
# This table can be directly subset using []
# It is a little tricky to pass the arguments
# I'm trying to avoid eval(parse())
# But still give a solution that isn't bound to a specific size
xtab_freq <- xtabs(formula = formula(x = paste("~",paste(names(df),collapse = " + "))),
data = df)
# Demonstrating what I mean
# All A
sum(xtab_freq["1",,])
# [1] 52
# AC
sum(xtab_freq["1",,"1"])
# [1] 30
# Using lapply(), we pass names(df) to combn() with m values of 1, 2, and 3
# The output of combn() goes through list(), then is unlisted with recursive FALSE
# This gives us a list of vectors
# Each one being a combination in which we are interested
lst_combs <- unlist(lapply(X = 1:3,FUN = combn,x = names(df),list),recursive = FALSE)
# For nice output naming, I just paste the values together
names(lst_combs) <- sapply(X = lst_combs,FUN = paste,collapse = "")
# This is a function I put together
# Generalizes process of extracting values from a crosstab
# It does it in this fashion to avoid eval(parse())
uFunc_GetMargins <- function(crosstab,varvector,success) {
# Obtain the dimname-names (the names within each dimension)
# From that, get the regular dimnames
xtab_dnn <- dimnames(crosstab)
xtab_dn <- names(xtab_dnn)
# Use match() to get a numeric vector for the margins
# This can be used in margin.table()
tgt_margins <- match(x = varvector,table = xtab_dn)
# Obtain a margin table
marginal <- margin.table(x = crosstab,margin = tgt_margins)
# To extract the value, figure out which marginal cell contains
# all variables of interest set to success
# sapply() goes over all the elements of the dimname names
# Finds numeric index in that dimension where the name == success
# We subset the resulting vector by tgt_margins
# (to only get the cells in our marginal table)
# Then, use prod() to multiply them together and get the location
tgt_cell <- prod(sapply(X = xtab_dnn,
FUN = match,
x = success)[tgt_margins])
# Return as named list for ease of stacking
return(list(count = marginal[tgt_cell]))
}
# Doing a call of mapply() lets us get the results
do.call(what = rbind.data.frame,
args = mapply(FUN = uFunc_GetMargins,
varvector = lst_combs,
MoreArgs = list(crosstab = xtab_freq,
success = "1"),
SIMPLIFY = FALSE,
USE.NAMES = TRUE))
# count
# A 52
# B 47
# C 66
# AB 24
# AC 30
# BC 34
# ABC 15
I ditched the prior solution that used aggregate.
Using dplyr,
Occurrence of only A:
library(dplyr)
df %>% filter(A == 1) %>% summarise(Total = nrow(.))
Occurrence of A and B:
df %>% filter(A == 1, B == 1) %>% summarise(Total = nrow(.))
Occurence of A, B, and C
df %>% filter(A == 1, B == 1, C == 1) %>% summarise(Total = nrow(.))

Passing a list of functions to ddply in R

I want to able to apply a list of function to ddply so that the list can expand based on what I want.
Something like this:
func_list = list(Start.Date = "min(Date)", End.Date = "max(Date)")
do.call(ddply, c(list(.data = df, .variables = grps, .fun=summarize), func_list))
When I run that, it calls ddply but then has a variable called Start.Date that equals the string "min(Date)". I've tried not quoting, but that doesn't work either.
Edit (example code and desired results):
library(plyr)
get_summary <- function(grps, func_list, df = raw) {
out <- do.call(ddply,
c(list(.data = df, .variables = grps, .fun=summarize),
func_list))
return(out)
}
raw <- data.frame(date = c(as.Date("2015-5-1"),
as.Date("2015-5-1"),
as.Date("2015-5-2"),
as.Date("2015-5-2")),
count = c(2,4,6,8),
amnt = c(100,200,300,400))
func_list <- list(
total_count = "sum(count)",
avg_amnt = "mean(amnt)"
)
get_summary("date", func_list)
# Desired output:
# date total_count avg_amnt
# 1 2015-05-01 6 150
# 2 2015-05-02 14 350
#
# Which is equivalent to:
# ddply(raw, "date", summarize, total_count = sum(count), avg_amnt = mean(amnt))
To fix your code you only need parse():
func_list <- list(
total_count = parse(text="sum(count)"),
avg_amnt = parse(text="mean(amnt)"))
This will tell the interpreter that the text should be evaluated as code and not as strings.

Resources