Aggregate Function - Keep NAs in data.frame - r

I want to use the aggregation function of R to aggregate a Price on several fields. However, I also have NAs in my data, which I would like to keep.
Tried:
> dput(df)
structure(list(ID = c(1L, 2L, 3L, 4L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 3L, 2L, 1L), REFERENCE = c("TEST1", "TEST2", "TEST3",
"TEST4", "TEST1", "TEST2", "TEST3", "TEST4", "TEST1", "TEST2",
"TEST3", "TEST4", "TEST1", "TEST2", "", "TEST2"), ISS = c(1234L,
1234L, 1111L, 1111L, 1234L, 1111L, 1234L, 1111L, 1234L, NA, 1234L,
1111L, 1234L, 1111L, 1234L, NA), Price = c(10L, NA, 20L, NA,
10L, 12L, NA, 99L, 100L, NA, 100L, 12L, NA, 11L, 0L, 12L)), .Names = c("ID",
"REFERENCE", "ISS", "Price"), row.names = c(NA, -16L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000000100788>)
>
> df <- aggregate(df$Price, by=list(ID=df$ID, REFERENCE=df$REFERENCE, ISS=df$ISS), FUN=sum)
Setting na.action = na.pass gives me:
Error in aggregate.data.frame(as.data.frame(x), ...) :
no rows to aggregate
As a result I would like to have:
Hence, I would like to keep my NA Data in my df.
Any recommendation how to implement that?
I appreciate your replies!

Instead of using aggregate on a "data.table", we can use the data.table methods. We get the sum of Price (sum(Price, na.rm=TRUE)) after grouping by "ID/REFERENCE/ISS" (by=list(ID, REFERENCE, ISS)), and then order the output by "ID" and "REFERENCE" (if needed).
library(data.table)
df[, sum(Price, na.rm=TRUE), by = list(ID, REFERENCE, ISS)][
order(ID, REFERENCE)]
# ID REFERENCE ISS V1
#1: 1 TEST1 1234 10
#2: 1 TEST2 1111 12
#3: 1 TEST2 NA 12
#4: 2 1234 0
#5: 2 TEST2 1234 0
#6: 2 TEST3 1234 100
#7: 3 TEST2 1111 11
#8: 3 TEST3 1111 20
#9: 3 TEST4 1111 111
#10: 4 TEST1 1234 110
#11: 4 TEST4 1111 0

Related

Replace all values in dataframe using another dataframe as key in R

I have two dataframes and I want to replace all values ( in all the columns) of df1 using the equivalent value in df2 (df2$value).
df1
structure(list(Cell_ID = c(7L, 2L, 3L, 10L), n_1 = c(0L, 0L,
0L, 0L), n_2 = c(9L, 1L, 4L, 1L), n_3 = c(10L, 4L, 5L, 2L), n_4 = c(NA,
5L, NA, 4L), n_5 = c(NA, 7L, NA, 6L), n_6 = c(NA, 9L, NA, 8L),
n_7 = c(NA, 10L, NA, 3L)), class = "data.frame", row.names = c(NA,
-4L))
df2
structure(list(Cell_ID = 0:10, value = c(5L, 100L, 200L, 300L,
400L, 500L, 600L, 700L, 800L, 900L, 1000L)), class = "data.frame", row.names = c(NA,
-11L))
The desired output would look like this:
So far I tried this, as suggested in another similar post, but it's not working well (it randomly misses some points).
key= df2$Cell_ID
value = df2$value
lapply(1:8,FUN = function(i){df1[df1 == key[i]] <<- value[i]})
Note that the numbers have just been multiplied by 10 for ease in the example; in the real data the numbers are all over the place, so just multiplying the dataframe by 10 won't work.
An option is to match the elements with the 'Cell_ID' of the second dataset and use that as an index to return the corresponding 'value' from 'df2'
library(dplyr)
df1 %>%
mutate(across(everything(), ~ df2$value[match(., df2$Cell_ID)]))
-output
# Cell_ID n_1 n_2 n_3 n_4 n_5 n_6 n_7
#1 700 5 900 1000 NA NA NA NA
#2 200 5 100 400 500 700 900 1000
#3 300 5 400 500 NA NA NA NA
#4 1000 5 100 200 400 600 800 300
Or another option is to use a named vector to do the match
library(tibble)
df1 %>%
mutate(across(everything(), ~ deframe(df2)[as.character(.)]))
The base R equivalent is
df1[] <- lapply(df1, function(x) df2$value[match(x, df2$Cell_ID)])

How to aggregate a data frame by columns and rows?

I have the following data set:
Class Total AC Final_Coverage
A 1000 1 55
A 1000 2 66
B 1000 1 77
A 1000 3 88
B 1000 2 99
C 1000 1 11
B 1000 3 12
B 1000 4 13
B 1000 5 22
C 1000 2 33
C 1000 3 44
C 1000 4 55
C 1000 5 102
A 1000 4 105
A 1000 5 109
I would like to get the average of the AC and the Final_Coverage for the first three rows of each class. Then, I want to store the average values along with the class name in a new dataframe. To do that, I did the following:
dataset <- read_csv("/home/ad/Desktop/testt.csv")
classes <- unique(dataset$Class)
new_data <- data.frame(Class = character(0), AC = numeric(0), Coverage = numeric(0))
for(class in classes){
new_data$Class <- class
dataClass <- subset(dataset, Class == class)
tenRows <- dataClass[1:3,]
coverageMean <- mean(tenRows$Final_Coverage)
acMean <- mean(tenRows$AC)
new_data$Coverage <- coverageMean
new_data$AC <- acMean
}
Everything works fine except entering the average value into the new_data frame. I get the following error:
Error in `$<-.data.frame`(`*tmp*`, "Class", value = "A") :
replacement has 1 row, data has 0
Do you know how to solve this?
This should get you the new dataframe by using dplyr.
dataset %>% group_by(Class) %>% slice(1:3) %>% summarise(AC= mean(AC),
Coverage= mean(Final_Coverage))
In your method, the error is that you initialized your new dataframe with 0 rows and then tried to assign a single value to it. This is what the error message reports: you are trying to put one row of data into a dataframe with 0 rows. This would work, though:
# Pre-allocate one row per class so the per-class means can be filled in place.
# Numeric NA placeholders keep the AC and Coverage columns numeric from the start.
new_data <- data.frame(Class = classes, AC = NA_real_, Coverage = NA_real_)
for (class in classes) {
  # Rows of `dataset` that belong to the current class, in their original order.
  dataClass <- subset(dataset, Class == class)
  # head() takes at most three rows; `dataClass[1:3, ]` would pad a shorter
  # class with NA rows and turn its means into NA.
  firstRows <- head(dataClass, 3)
  # Write only into the row of the current class. The Class column is already
  # filled by data.frame() above; assigning `new_data$Class <- class` here
  # would overwrite the whole column on every iteration.
  new_data$Coverage[classes == class] <- mean(firstRows$Final_Coverage)
  new_data$AC[classes == class] <- mean(firstRows$AC)
}
You could look into aggregate().
> aggregate(df1[df1$AC <= 3, 3:4], by=list(Class=df1[df1$AC <= 3, 1]), FUN=mean)
Class AC Final_Coverage
1 A 2 69.66667
2 B 2 62.66667
3 C 2 29.33333
DATA
df1 <- structure(list(Class = structure(c(1L, 1L, 2L, 1L, 2L, 3L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L), .Label = c("A", "B", "C"), class = "factor"),
Total = c(1000L, 1000L, 1000L, 1000L, 1000L, 1000L, 1000L,
1000L, 1000L, 1000L, 1000L, 1000L, 1000L, 1000L, 1000L),
AC = c(1L, 2L, 1L, 3L, 2L, 1L, 3L, 4L, 5L, 2L, 3L, 4L, 5L,
4L, 5L), Final_Coverage = c(55L, 66L, 77L, 88L, 99L, 11L,
12L, 13L, 22L, 33L, 44L, 55L, 102L, 105L, 109L)), class = "data.frame", row.names = c(NA,
-15L))

R: aggregate values on a tree

This question is similar to this, but it's got a C# answer, and I need a R answer.
I have some 50 files of about 650 rows with a format and data very similar to this toy data:
dput(y)
structure(list(level1 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L), level2 = c(NA, 41L, 41L, 41L, 41L, 41L, 41L, 41L,
42L, 42L, 42L, 42L), level3 = c(NA, NA, 4120L, 4120L, 4120L,
4120L, 4120L, 4120L, NA, 4210L, 4210L, 4210L), level4 = c(NA,
NA, NA, 412030L, 412030L, 412050L, 412050L, 412050L, NA, NA,
421005L, 421005L), pid = c(NA, NA, NA, NA, 123456L, NA, 789012L,
345678L, NA, NA, NA, 901234L), description = c("income", "op.income",
"manuf.industries", "manuf 1", "client 1", "manuf 2", "client 2",
"client 3", "non-op.income", "financial", "interest", "bank 1"
), value = c(NA, NA, NA, NA, 15000L, NA, 272860L, 1150000L, NA,
NA, NA, 378L)), .Names = c("level1", "level2", "level3", "level4",
"pid", "description", "value"), class = c("data.table", "data.frame"
), row.names = c(NA, -12L), .internal.selfref = <pointer: 0x00000000001a0788>)
Each of the rows that has a value in value is a "leaf" of a tree, with branches identified in columns level1 to level4. I want to summarize the leaves by branch and put the corresponding values in the value column.
My expected output looks like this:
dput(res)
structure(list(level1 = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L), level2 = c(NA, 41L, 41L, 41L, 41L, 41L, 41L, 41L,
42L, 42L, 42L, 42L), level3 = c(NA, NA, 4120L, 4120L, 4120L,
4120L, 4120L, 4120L, NA, 4210L, 4210L, 4210L), level4 = c(NA,
NA, NA, 412030L, 412030L, 412050L, 412050L, 412050L, NA, NA,
421005L, 421005L), pid = c(NA, NA, NA, NA, 123456L, NA, 789012L,
345678L, NA, NA, NA, 901234L), description = c("income", "op.income",
"manuf.industries", "manuf 1", "client 1", "manuf 2", "client 2",
"client 3", "non-op.income", "financial", "interest", "bank 1"
), value = c(1438238L, 1437860L, 1437860L, 15000L, 15000L, 1422860L,
272860L, 1150000L, 378L, 378L, 378L, 378L)), .Names = c("level1",
"level2", "level3", "level4", "pid", "description", "value"), class = c("data.table",
"data.frame"), row.names = c(NA, -12L), .internal.selfref = <pointer: 0x00000000001a0788>)
I know this can be done with a for-loop, but I wanted to know if there is any faster, simpler alternative (I prefer data.table or base-solutions, but any other package works ok too). What I've tried so far:
z4<-y[!is.na(pid),sum(value),by=level4]
setkey(y,"level4");setkey(z4,"level4")
y[z4,][is.na(pid)]
This shows me the desired values in V1, so I wanted to see if I could assign them to value:
y[z4,][is.na(pid),value:=i.V1]
Error in eval(expr, envir, enclos) : object 'i.V1' not found
I think this could be caused because the call i.V1 is in the chained [ and not in the initial y[z4 call. But if I only subset on z4, how can I know which of the several matching level4 rows I should assign (that's why I'm thinking of using is.na(pid), because y[z4,value:=i.V1] produces the wrong result, as it updates all values that match level4).
As you can see, I'm badly stuck at this problem, and with "my method" I still would have 3 more levels to go.
Is there any easier way to do this?
Because the computations at each level require those from the previous level, I think a loop or recursion is required. Here is a recursive function to get the values using base R. You could surely do something similar with data.table, which would probably be much more efficient.
## Use y as data.frame
y <- as.data.frame(y)
## Recursive function to get values.
##
## Handles one level per call (level1, level2, level3, level4). At each level
## it sums `value` per code of that level and writes the sum into the rows
## that carry that code but have NA in the next-level column (the next-level
## column of level4 is "pid").
##
## data: data.frame with columns level1..level4, pid and value.
## lvl:  level handled by this call; NULL (the default) starts at level 1.
## Returns `data` with `value` filled in for those rows.
f <- function(data, lvl = NULL) {
  if (is.null(lvl)) lvl <- 1                      # initialize level
  if (lvl == 5) return(data)                      # we are done
  cname <- paste0("level", lvl)                   # name of current level
  # Name of next level. `lvl` is a scalar, so plain if/else instead of ifelse().
  nname <- if (lvl == 4) "pid" else paste0("level", lvl + 1)
  # Sum of value per code of the current level. The formula method drops rows
  # with NA in either column (default na.action), so rows that are still
  # unfilled do not contribute.
  agg <- aggregate(as.formula(paste("value ~", cname)), data = data, FUN = sum)
  # Position of each row's code in `agg`; 0 where the code is absent or NA.
  ms <- match(data[[cname]], agg[[cname]], nomatch = 0L)
  # Rows to fill: the code has an aggregate and the next level is missing.
  inds <- ms > 0L & is.na(data[[nname]])
  data$value[inds] <- agg$value[ms[inds]]         # add new values
  f(data, lvl + 1)                                # recurse
}
f(data=y)
# level1 level2 level3 level4 pid description value
# 1 4 NA NA NA NA income 1438238
# 2 4 41 NA NA NA op.income 1437860
# 3 4 41 4120 NA NA manuf.industries 1437860
# 4 4 41 4120 412030 NA manuf 1 15000
# 5 4 41 4120 412030 123456 client 1 15000
# 6 4 41 4120 412050 NA manuf 2 1422860
# 7 4 41 4120 412050 789012 client 2 272860
# 8 4 41 4120 412050 345678 client 3 1150000
# 9 4 42 NA NA NA non-op.income 378
# 10 4 42 4210 NA NA financial 378
# 11 4 42 4210 421005 NA interest 378
# 12 4 42 4210 421005 901234 bank 1 378
I think the aggregation step could be made more efficient by only aggregating a subset of the data if need be. Honestly, this was fun, but a loop is probably the way to go.

R- merge two dataframes but values of ID have semicolons

This is a followup question to:
R- merge two data frames but some values have semi colon in them
which has been addressed by contributor: agstudy.
The actual data discussed in the link is a bit more complex and i have been stuck for a while.
This is what my dataframe (df2) looks like:
myIDColumn someName somevalue
AB gsdfg 123
CD tfgsdfg 234
EF sfdgsf 365
GH gdfgb 53453
IJ sr 64564
KL sfsdv 4234234
MN ewrwe 5
OP dsfsss 3453
QR gggg 667
ST dss 7567
UV hhhhjf 55
WX dfadasad 8657
YZ ghfgh 1234
ABC gdgfg 234455
VCB hgjkk 5555667
This is what my df1 looks like:
ID someText someThing
AB ada 12
CD;EF;QR dfsdf 13
IJ fgfgd 14
KL fgdg 15
MN gh 16
OP;WX jhjhj 17
WW ghjgjhgjghj 18
YZ kkl 19
This is what i was hoping to get as an output:
I can merge the two well by using:
mm <- merge(df2,df1,by.y='ID',by.x='myIDColumn',all.y=TRUE)
but after that no idea how to proceed further.
Any help is really appreciated. Thanks.
df1:
structure(list(ID = structure(1:8, .Label = c("AB", "CD;EF;QR",
"IJ", "KL", "MN", "OP;WX", "WW", "YZ"), class = "factor"), someText = structure(c(1L,
2L, 4L, 3L, 5L, 7L, 6L, 8L), .Label = c("ada", "dfsdf", "fgdg",
"fgfgd", "gh", "ghjgjhgjghj", "jhjhj", "kkl"), class = "factor"),
someThing = 12:19), .Names = c("ID", "someText", "someThing"
), class = "data.frame", row.names = c(NA, -8L))
df2:
structure(list(myIDColumn = structure(c(1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 14L, 15L, 2L, 13L), .Label = c("AB", "ABC",
"CD", "EF", "GH", "IJ", "KL", "MN", "OP", "QR", "ST", "UV", "VCB",
"WX", "YZ"), class = "factor"), someName = structure(c(9L, 15L,
12L, 5L, 14L, 13L, 4L, 2L, 7L, 3L, 11L, 1L, 8L, 6L, 10L), .Label = c("dfadasad",
"dsfsss", "dss", "ewrwe", "gdfgb", "gdgfg", "gggg", "ghfgh",
"gsdfg", "hgjkk", "hhhhjf", "sfdgsf", "sfsdv", "sr", "tfgsdfg"
), class = "factor"), somevalue = c(123L, 234L, 365L, 53453L,
64564L, 4234234L, 5L, 3453L, 667L, 7567L, 55L, 8657L, 1234L,
234455L, 5555667L)), .Names = c("myIDColumn", "someName", "somevalue"
), class = "data.frame", row.names = c(NA, -15L))
There are probably better ways to do it but you could create a temporary dataframe:
df1 <- structure(list(ID = c("AB", "CD;EF;QR", "IJ", "KL", "MN", "OP;WX",
"WW", "YZ"), someText = c("ada", "dfsdf", "fgfgd", "fgdg", "gh",
"jhjhj", "ghjgjhgjghj", "kkl"), someThing = 12:19), .Names = c("ID",
"someText", "someThing"), class = "data.frame", row.names = c(NA,
-8L))
df2 <- structure(list(myIDColumn = c("AB", "CD", "EF", "GH", "IJ", "KL",
"MN", "OP", "QR", "ST", "UV", "WX", "YZ", "ABC", "VCB"), someName = c("gsdfg",
"tfgsdfg", "sfdgsf", "gdfgb", "sr", "sfsdv", "ewrwe", "dsfsss",
"gggg", "dss", "hhhhjf", "dfadasad", "ghfgh", "gdgfg", "hgjkk"
), somevalue = c(123L, 234L, 365L, 53453L, 64564L, 4234234L,
5L, 3453L, 667L, 7567L, 55L, 8657L, 1234L, 234455L, 5555667L)), .Names = c("myIDColumn",
"someName", "somevalue"), class = "data.frame", row.names = c(NA,
-15L))
# Expand the rows of `x` so that every ';'-separated ID gets a row of its own.
#
# x: data.frame with columns ID, someText and someThing (one group of df1 as
#    passed by ddply, usually a single row).
# Returns a data.frame with the original ID, someText and someThing repeated
# once per individual ID, plus a column ID1 holding that individual ID.
f <- function(x) {
  # as.character() makes this work when ID is a factor; strsplit() only
  # accepts character input. fixed = TRUE treats ";" as a literal separator.
  parts <- strsplit(as.character(x$ID), ";", fixed = TRUE)
  # An empty ID splits into zero pieces; keep its row with an NA ID1 rather
  # than failing on mismatched column lengths.
  parts[lengths(parts) == 0L] <- NA_character_
  # Repeat each input row once per piece so every ID1 stays paired with its
  # own row even when `x` has more than one row.
  n <- lengths(parts)
  data.frame(ID = rep(x$ID, n), someText = rep(x$someText, n),
             someThing = rep(x$someThing, n), ID1 = unlist(parts))
}
library(plyr)
df3 <- ddply(df1, .(ID), f)
> df3
ID someText someThing ID1
1 AB ada 12 AB
2 CD;EF;QR dfsdf 13 CD
3 CD;EF;QR dfsdf 13 EF
4 CD;EF;QR dfsdf 13 QR
5 IJ fgfgd 14 IJ
6 KL fgdg 15 KL
7 MN gh 16 MN
8 OP;WX jhjhj 17 OP
9 OP;WX jhjhj 17 WX
10 WW ghjgjhgjghj 18 WW
11 YZ kkl 19 YZ
You could merge this with your dataframe df2 and summarize the data:
mm <- merge(df2,df3,by.y='ID1',by.x='myIDColumn',all.y=TRUE)
ddply(mm, .(ID,someText, someThing), summarize,
somevalue = paste(somevalue, collapse=','),
someName = paste(someName, collapse = ","))
ID someText someThing somevalue someName
1 AB ada 12 123 gsdfg
2 CD;EF;QR dfsdf 13 234,365,667 tfgsdfg,sfdgsf,gggg
3 IJ fgfgd 14 64564 sr
4 KL fgdg 15 4234234 sfsdv
5 MN gh 16 5 ewrwe
6 OP;WX jhjhj 17 3453,8657 dsfsss,dfadasad
7 WW ghjgjhgjghj 18 NA NA
8 YZ kkl 19 1234 ghfgh

Re-align column in data frame into multiple columns

I'm trying to change a data frame column (var3, in the example below) that has multiple values for factor levels of another variable (names, in the example below). I'd like var3 to be split into separate columns, one for each value, so that the factor levels in names do not repeat. My other variables (var1, var2) repeat where necessary to provide space for var3.
This is the kind of data I have:
df1 <- structure(list(name = structure(c(2L, 4L, 4L, 4L, 3L, 5L, 5L,
1L), .Label = c("fifth", "first", "fourth", "second", "third"
), class = "factor"), var1 = c(90L, 84L, 84L, 84L, 18L, 22L,
22L, 36L), var2 = c(301L, 336L, 336L, 336L, 412L, 296L, 296L,
357L), var3 = c(-0.582075925, -1.108889624, -1.014962009, -0.162309524,
-0.282309524, 0.563055819, -0.232075925, -0.773353424)), .Names = c("name",
"var1", "var2", "var3"), class = "data.frame", row.names = c(NA, -8L))
This is what i'd like:
df2 <- structure(list(name = structure(c(2L, 4L, 3L, 5L, 1L), .Label = c("fifth",
"first", "fourth", "second", "third"), class = "factor"), var1 = c(90L,
84L, 18L, 22L, 36L), var2 = c(301L, 336L, 412L, 296L, 357L),
var3 = c(-0.582075925, -1.108889624, -0.282309524, 0.563055819,
-0.773353424), var3.2 = c(NA, -1.014962009, NA, -0.232075925,
NA), var3.3 = c(NA, -0.162309524, NA, NA, NA)), .Names = c("name", "var1",
"var2", "var3", "var3.2", "var3.3"), class = "data.frame", row.names = c(NA, -5L))
I've looked at reshape and ddply, but can't get them to give me this output.
Here's a base solution:
> df1$seqnam <- ave(as.character(df1$name), df1$name, FUN=seq) # creates a "time" index
> reshape(df1, direction="wide", timevar="seqnam", idvar=c("name", "var1", "var2") )
name var1 var2 var3.1 var3.2 var3.3
1 first 90 301 -0.5820759 NA NA
2 second 84 336 -1.1088896 -1.0149620 -0.1623095
5 fourth 18 412 -0.2823095 NA NA
6 third 22 296 0.5630558 -0.2320759 NA
8 fifth 36 357 -0.7733534 NA NA
# For every name, build a single row holding name, var1, var2 and the var3
# values of that name spread across columns var3.1, var3.2, ...
ddply(df1, .(name), function(x) {
  # rbind() of a plain vector gives a one-row matrix, hence a one-row data.frame.
  # NOTE: unique() collapses var3 values that repeat within the same name.
  var3 <- data.frame(rbind(unique(x$var3)))
  # seq_along() numbers the columns safely, whatever their count.
  names(var3) <- paste0("var3.", seq_along(var3))
  data.frame(name = unique(x$name), var1 = unique(x$var1),
             var2 = unique(x$var2), var3)
})
name var1 var2 var3.1 var3.2 var3.3
1 fifth 36 357 -0.7733534 NA NA
2 first 90 301 -0.5820759 NA NA
3 fourth 18 412 -0.2823095 NA NA
4 second 84 336 -1.1088896 -1.0149620 -0.1623095
5 third 22 296 0.5630558 -0.2320759 NA
The function can be modified if you expect var1 and var2 to also contain multiple values.

Resources