I have a dataframe as shown:
structure(list(ID = c(1, 1, 1, 1, 2, 2, 2, 2), ColA = c(2, 3,
4, 5, 2, 3, 4, 5), ColB = c(1, 2, 3, 4, 1, 2, 3, 4), ColA_0.2 = c(2,
3.4, 4.68, 5.936, 2, 3.4, 4.68, 5.936), ColB_0.2 = c(1, 2.2,
3.44, 4.688, 1, 2.2, 3.44, 4.688)), class = "data.frame", row.names = c(NA,
-8L))
What I need: for each ID, I want to calculate ColA_ad and ColB_ad. The user will pass a parameter 'ad'.
For example - if 'ad' is 0.2 then the values will be calculated as:
First row - same as ColA (i.e. 2)
Second row - add the second row of ColA to 0.2 * the first row of ColA_ad (i.e. 3 + 0.2*2 = 3.4)
Third row - add the third row of ColA to 0.2 * the second row of ColA_ad (i.e. 4 + 0.2*3.4 = 4.68)
and so on.
The same will be calculated for all other columns (here ColB), which can be passed in a separate vector.
Summary - I take a 0.2-times carry-over effect from the previously calculated row and add it to the new row.
The expected results are shown above in the columns ColA_0.2 and ColB_0.2.
As my dataset is very large, I am looking for a data.table solution.
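To state the rule precisely, here is the recurrence as a plain R loop (a reference sketch only; the helper name carry_over is just for illustration, and a loop like this is too slow for my data):

# Reference implementation of the recurrence (hypothetical helper, not the
# fast solution): out[1] = x[1], out[i] = x[i] + ad * out[i - 1]
carry_over <- function(x, ad) {
  out <- numeric(length(x))
  out[1] <- x[1]
  for (i in seq_along(x)[-1]) out[i] <- x[i] + ad * out[i - 1]
  out
}
carry_over(c(2, 3, 4, 5), ad = 0.2)
# [1] 2.000 3.400 4.680 5.936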
Here is a base R solution, where a linear algebra property is applied to speed up your iterative calculation.
Basic idea (taking ID = 1 as an example):
You first construct a lower triangular matrix for mapping from col to col_ad, i.e.,
l <- 0.2^abs(outer(seq(4), seq(4), "-"))
l[upper.tri(l)] <- 0
which gives
> l
[,1] [,2] [,3] [,4]
[1,] 1.000 0.00 0.0 0
[2,] 0.200 1.00 0.0 0
[3,] 0.040 0.20 1.0 0
[4,] 0.008 0.04 0.2 1
then you left-multiply the data columns col by l (the recurrence unrolls to col_ad[i] = sum over j <= i of ad^(i-j) * col[j], which is exactly this matrix product), i.e.,
> l %*% as.matrix(subset(df,ID == 1)[-1])
ColA ColB
[1,] 2.000 1.000
[2,] 3.400 2.200
[3,] 4.680 3.440
[4,] 5.936 4.688
code
ad <- 0.2
col_ad <- do.call(rbind,
                  c(make.row.names = FALSE,
                    lapply(split(df, df$ID),
                           function(x) {
                             l <- ad^abs(outer(seq(nrow(x)), seq(nrow(x)), "-"))
                             l[upper.tri(l)] <- 0
                             `colnames<-`(data.frame(l %*% as.matrix(x[-1])),
                                          paste0(names(x[-1]), "_", ad))
                           })))
dfout <- cbind(df,col_ad)
such that
> dfout
ID ColA ColB ColA_0.2 ColB_0.2
1 1 2 1 2.000 1.000
2 1 3 2 3.400 2.200
3 1 4 3 4.680 3.440
4 1 5 4 5.936 4.688
5 2 2 1 2.000 1.000
6 2 3 2 3.400 2.200
7 2 4 3 4.680 3.440
8 2 5 4 5.936 4.688
DATA
df <- structure(list(ID = c(1, 1, 1, 1, 2, 2, 2, 2), ColA = c(2, 3,
4, 5, 2, 3, 4, 5), ColB = c(1, 2, 3, 4, 1, 2, 3, 4)), class = "data.frame", row.names = c(NA,
-8L))
A non-recursive option:
library(data.table)
setDT(DT)[, paste0(cols, "_", ad) := {
  m <- matrix(unlist(shift(ad^(seq_len(.N) - 1L), 0L:(.N - 1L), fill = 0)), nrow = .N)
  lapply(.SD, function(x) c(m %*% x))
}, by = ID, .SDcols = cols]
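For intuition, m is the same lower triangular mapping used in the base R answer; a quick check outside the grouped call (a sketch, with data.table loaded and taking .N = 4, ad = 0.2):

ad <- 0.2
m <- matrix(unlist(shift(ad^(seq_len(4) - 1L), 0L:3L, fill = 0)), nrow = 4)
m
#       [,1] [,2] [,3] [,4]
# [1,] 1.000 0.00  0.0    0
# [2,] 0.200 1.00  0.0    0
# [3,] 0.040 0.20  1.0    0
# [4,] 0.008 0.04  0.2    1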
Another, recursive option (data.table reuses the j evaluation environment across the seq_len(.N) groups, so a and b persist between rows and accumulate the carry-over; the helper index column is dropped at the end with [, (1) := NULL]):
library(data.table)
setDT(DT)[, paste0(cols,"_",ad) := {
a <- 0
b <- 0
.SD[, {
a <- ColA + ad*a
b <- ColB + ad*b
.(a, b)
}, seq_len(.N)][, (1) := NULL]
},
by = ID]
output:
ID ColA ColB ColA_0.2 ColB_0.2
1: 1 2 1 2.000 1.000
2: 1 3 2 3.400 2.200
3: 1 4 3 4.680 3.440
4: 1 5 4 5.936 4.688
5: 2 2 1 2.000 1.000
6: 2 3 2 3.400 2.200
7: 2 4 3 4.680 3.440
8: 2 5 4 5.936 4.688
data:
DT <- structure(list(ID = c(1, 1, 1, 1, 2, 2, 2, 2), ColA = c(2, 3,
4, 5, 2, 3, 4, 5), ColB = c(1, 2, 3, 4, 1, 2, 3, 4), ColA_0.2 = c(2,
3.4, 4.68, 5.936, 2, 3.4, 4.68, 5.936), ColB_0.2 = c(1, 2.2,
3.44, 4.688, 1, 2.2, 3.44, 4.688)), class = "data.frame", row.names = c(NA,
-8L))
ad <- 0.2
cols <- c("ColA", "ColB")
Here is one way with data.table using Reduce:
#Columns to apply function to
cols <- names(df)[2:3]
#Create a function to apply
apply_fun <- function(col, ad) {
Reduce(function(x, y) sum(y, x * ad), col, accumulate = TRUE)
}
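For example, applied to a single vector with ad = 0.2, the accumulated values match the expected column:

apply_fun(c(2, 3, 4, 5), ad = 0.2)
# [1] 2.000 3.400 4.680 5.936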
library(data.table)
#Convert dataframe to data.table
setDT(df)
#set ad value
ad <- 0.2
#Apply function to each column in cols
df[, (paste(cols, ad, sep = "_")) := lapply(.SD, apply_fun, ad), .SDcols = cols, by = ID]
df
# ID ColA ColB ColA_0.2 ColB_0.2
#1: 1 2 1 2.000 1.000
#2: 1 3 2 3.400 2.200
#3: 1 4 3 4.680 3.440
#4: 1 5 4 5.936 4.688
#5: 2 2 1 2.000 1.000
#6: 2 3 2 3.400 2.200
#7: 2 4 3 4.680 3.440
#8: 2 5 4 5.936 4.688
Related
I have two data frames with equal number of rows. The snippet is
df1 <- data.frame(Id = c(123, 124, 125), X1 = c(1, 2, 1), X2 = c(NA_integer_, 1, 2))
df1
Id X1 X2
1 123 1 NA
2 124 2 1
3 125 1 2
df2 <- data.frame(Id = c(123, 124, 125), X1_1 = c(NA_integer_, 2, 1), X1_2 = c(1, NA_integer_, 2), X1_3 = c(2, 3, 3), X2_1 = c(NA_integer_, 1, NA_integer_), X2_2 = c(NA_integer_, 3, NA_integer_), X2_3 = c(NA_integer_, 2, 2))
df2
Id X1_1 X1_2 X1_3 X2_1 X2_2 X2_3
1 123 NA 1 2 NA NA NA
2 124 2 NA 3 1 3 2
3 125 1 2 3 NA NA 2
I need to create a list with df2's elements arranged according to the group positions from df1, with NAs removed:
$`123`
[1] 1 2
$`124`
[1] 1 3 2 2 3
$`125`
[1] 1 2 3 2
How to create this list in R?
Here is one option
Map(function(u, v) {
nm1 <- sub("_\\d+", "", names(v))
grp <- match(nm1, unique(nm1))
as.vector(na.omit(unlist(split.default(v, grp)[unlist(u)], use.names = FALSE)))
}, split(df1[-1], df1$Id),
split(df2[-1], df2$Id))
#$`123`
#[1] 1 2
#$`124`
#[1] 1 3 2 2 3
#$`125`
#[1] 1 2 3 2
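To see what the function does, here are the intermediate steps for Id 124 (a walkthrough sketch; blocks is just an illustrative name):

u <- df1[df1$Id == 124, -1]          # X1 = 2, X2 = 1 (desired block order)
v <- df2[df2$Id == 124, -1]
nm1 <- sub("_\\d+", "", names(v))    # "X1" "X1" "X1" "X2" "X2" "X2"
grp <- match(nm1, unique(nm1))       # 1 1 1 2 2 2
# reorder the column blocks by the positions in u: X2 block first, then X1
blocks <- split.default(v, grp)[unlist(u)]
as.vector(na.omit(unlist(blocks, use.names = FALSE)))
# [1] 1 3 2 2 3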
Another option using lapply, setNames, sort and grep:
lapply(setNames(df1$Id, df1$Id), function(i) {
x <- df2[df2$Id==i, unlist(lapply(paste0("^", names(sort(df1[df1$Id==i,-1])), "_"), grep, colnames(df2)))]
x[!is.na(x)]
})
#$`123`
#[1] 1 2
#
#$`124`
#[1] 1 3 2 2 3
#
#$`125`
#[1] 1 2 3 2
I have a data.table similar to the one below, but with around 3 million rows and a lot more columns.
key1 price qty status category
1: 1 9.26 3 5 B
2: 1 14.64 1 5 B
3: 1 16.66 3 5 A
4: 1 18.27 1 5 A
5: 2 2.48 1 7 A
6: 2 0.15 2 7 C
7: 2 6.29 1 7 B
8: 3 7.06 1 2 A
9: 3 24.42 1 2 A
10: 3 9.16 2 2 C
11: 3 32.21 2 2 B
12: 4 20.00 2 9 B
Here's the dput() output:
dados = structure(list(key1 = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4),
price = c(9.26, 14.64, 16.66, 18.27, 2.48, 0.15, 6.29, 7.06,
24.42, 9.16, 32.21, 20), qty = c(3, 1, 3, 1, 1, 2, 1, 1,
1, 2, 2, 2), status = c(5, 5, 5, 5, 7, 7, 7, 2, 2, 2, 2,
9), category = c("B", "B", "A", "A", "A", "C", "B", "A",
"A", "C", "B", "B")), .Names = c("key1", "price", "qty",
"status", "category"), row.names = c(NA, -12L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000004720788>)
I need to transform this data so that I have one entry for each key, and in the process I need to create some additional variables. So far I have been using this:
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
key.aggregate = function(x){
return(data.table(
key1 = Mode(x$key1),
perc.A = sum(x$price[x$category == "A"],na.rm=T)/sum(x$price),
perc.B = sum(x$price[x$category == "B"],na.rm=T)/sum(x$price),
perc.C = sum(x$price[x$category == "C"],na.rm=T)/sum(x$price),
status = Mode(x$status),
qty = sum(x$qty),
price = sum(x$price)
))
}
new_data = split(dados,by = "key1") #Runs out of RAM here
results = rbindlist(lapply(new_data,key.aggregate))
And expecting the following output:
> results
key1 perc.A perc.B perc.C status qty price
1: 1 0.5937447 0.4062553 0.00000000 5 8 58.83
2: 2 0.2780269 0.7051570 0.01681614 7 4 8.92
3: 3 0.4321208 0.4421414 0.12573782 2 6 72.85
4: 4 0.0000000 1.0000000 0.00000000 9 2 20.00
But I'm always running out of RAM when splitting the data by keys. I've tried using only a third of the data, and then only a sixth, but it still gives the same Error: cannot allocate vector of size 593 Kb.
I'm thinking this approach is very inefficient; what would be the best way to get this result?
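A sketch of the same aggregation done directly with data.table's by= grouping, which avoids the split() copies entirely (it assumes the Mode() helper defined above):

library(data.table)
setDT(dados)
results <- dados[, .(
  perc.A = sum(price[category == "A"], na.rm = TRUE) / sum(price),
  perc.B = sum(price[category == "B"], na.rm = TRUE) / sum(price),
  perc.C = sum(price[category == "C"], na.rm = TRUE) / sum(price),
  status = Mode(status),
  qty    = sum(qty),
  price  = sum(price)
), by = key1]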
Hello, I have a question about matching two data.frames.
Consider I have two datasets:
Dataframe 1:
"A" "B"
91 1
92 3
93 11
94 4
95 10
96 6
97 7
98 8
99 9
100 2
structure(list(A = 91:100, B = c(1, 3, 11, 4, 10, 6, 7, 8, 9,
2)), .Names = c("A", "B"), row.names = c(NA, -10L), class = "data.frame")
Dataframe 2:
"C" "D"
91.12 1
92.34 3
93.65 11
94.23 4
92.14 10
96.98 6
97.22 7
98.11 8
93.15 9
100.67 2
91.25 1
96.45 3
83.78 11
84.66 4
100 10
structure(list(C = c(91.12, 92.34, 93.65, 94.23, 92.14, 96.98,
97.22, 98.11, 93.15, 100.67, 91.25, 96.45, 83.78, 84.66, 100),
D = c(1, 3, 11, 4, 10, 6, 7, 8, 9, 2, 1, 3, 11, 4, 10)), .Names = c("C",
"D"), row.names = c(NA, -15L), class = "data.frame")
Now I want to find rounded matches between columns A and C and replace column D by the respective value from column B of Dataframe 1. Where there is no corresponding value (by rounded match between A and C), I want an NaN in the replaced column D.
result:
"C" "newD"
91.12 1
92.34 3
93.65 4
94.23 4
92.14 3
96.98 7
97.22 7
98.11 8
93.15 11
100.67 NaN
91.25 1
96.45 6
83.78 NaN
84.66 NaN
100 2
structure(list(C = c(91.12, 92.34, 93.65, 94.23, 92.14, 96.98,
97.22, 98.11, 93.15, 100.67, 91.25, 96.45, 83.78, 84.66, 100),
D = c(1, 3, 4, 4, 3, 7, 7, 8, 11, NaN, 1, 6, NaN, NaN, 2)), .Names = c("C",
"D"), row.names = c(NA, -15L), class = "data.frame")
Does anybody know how to do that, especially for large datasets?
Thanks a lot!
Making an update join with data.table:
library(data.table)
setDT(DF1); setDT(DF2)
DF2[, A := round(C)]
DF2[, D := DF1[DF2, on=.(A), x.B] ]
# alternately, chain together in one step:
DF2[, A := round(C)][, D := DF1[DF2, on=.(A), x.B] ]
This gives NA in unmatched rows. To get NaN instead: DF2[is.na(D), D := NaN].
To drop the new DF2$A column, use DF2[, A := NULL].
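With the sample data, DF2 should then look like this (before dropping the helper column A):

DF2
#          C  D   A
#  1:  91.12  1  91
#  2:  92.34  3  92
#  3:  93.65  4  94
#  4:  94.23  4  94
#  5:  92.14  3  92
#  6:  96.98  7  97
#  7:  97.22  7  97
#  8:  98.11  8  98
#  9:  93.15 11  93
# 10: 100.67 NA 101
# 11:  91.25  1  91
# 12:  96.45  6  96
# 13:  83.78 NA  84
# 14:  84.66 NA  85
# 15: 100.00  2 100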
Does anybody know how to do that, especially for large datasets?
This modifies DF2 in place (instead of materializing a new table, as the vanilla join in Mike's answer does), so it should be fairly efficient for large tables. It might perform better still if A is stored as an integer instead of a float in both tables.
On data.table 1.9.6, use on="A", B instead of on=.(A), x.B. Thanks to Mike H for checking this.
You can create a lookup table where the values in A are used to look up the values in B.
Lookup = df1$B
names(Lookup) = df1$A
df3 = data.frame(C = df2$C, newD = Lookup[as.character(round(df2$C))])
df3$newD[is.na(df3$newD)] = NaN
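With the data above, df3 should match the desired result:

df3
#         C newD
# 1   91.12    1
# 2   92.34    3
# 3   93.65    4
# 4   94.23    4
# 5   92.14    3
# 6   96.98    7
# 7   97.22    7
# 8   98.11    8
# 9   93.15   11
# 10 100.67  NaN
# 11  91.25    1
# 12  96.45    6
# 13  83.78  NaN
# 14  84.66  NaN
# 15 100.00    2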
For these types of merges I like SQL:
library(sqldf)
res <- sqldf("SELECT l.C, r.B
FROM df2 as l
LEFT JOIN df1 as r
on round(l.C) = round(r.A)")
res
# C B
#1 91.12 1
#2 92.34 3
#3 93.65 4
#4 94.23 4
#5 92.14 3
#6 96.98 7
#7 97.22 7
#8 98.11 8
#9 93.15 11
#10 100.67 NA
#11 91.25 1
#12 96.45 6
#13 83.78 NA
#14 84.66 NA
#15 100.00 2
I have a data.frame where I'd like to remove entire groups if any of their members meets a condition.
In this first example, if the values are numbers and the condition is NA the code below works.
df <- structure(list(world = c(1, 2, 3, 3, 2, NA, 1, 2, 3, 2), place = c(1,
1, 2, 2, 3, 3, 1, 2, 3, 1), group = c(1, 1, 1, 2, 2, 2, 3,
3, 3, 3)), .Names = c("world", "place", "group"), row.names = c(NA,
-10L), class = "data.frame")
library(plyr)
ans <- ddply(df, .(group), summarize, code = mean(world))
ans$code[is.na(ans$code)] <- 0
ans2 <- merge(df,ans)
final.ans <- ans2[ans2$code !=0,]
However, this ddply maneuver with the NA values will not work if the condition is something other than NA, or if the values are non-numeric.
For example, if I wanted to remove groups which have one or more rows with a world value of AF (as in the data frame below) this ddply trick would not work.
df2 <-structure(list(world = structure(c(1L, 2L, 3L, 3L, 3L, 5L, 1L,
4L, 2L, 4L), .Label = c("AB", "AC", "AD", "AE", "AF"), class = "factor"),
place = c(1, 1, 2, 2, 3, 3, 1, 2, 3, 1), group = c(1,
1, 1, 2, 2, 2, 3, 3, 3, 3)), .Names = c("world", "place",
"group"), row.names = c(NA, -10L), class = "data.frame")
I can envision a for-loop where, for each group, the value of each member is checked; if the condition is met, a code column could be populated, and then a subset could be made based on that code.
But perhaps there is a vectorized, R way to do this?
Using dplyr:
library(dplyr)
df2 %>%
group_by(group) %>%
filter(!any(world == "AF"))
Or, as mentioned by @akrun, using data.table:
library(data.table)
setDT(df2)[, if(!any(world == "AF")) .SD, group]
#or
setDT(df2)[, if(all(world != "AF")) .SD, group]
Which gives:
#Source: local data frame [7 x 3]
#Groups: group
#
# world place group
#1 AB 1 1
#2 AC 1 1
#3 AD 2 1
#4 AB 1 3
#5 AE 2 3
#6 AC 3 3
#7 AE 1 3
Alternate data.table solution:
setDT(df2)
df2[!(group %in% df2[world == "AF",group])]
gives:
world place group
1: AB 1 1
2: AC 1 1
3: AD 2 1
4: AB 1 3
5: AE 2 3
6: AC 3 3
7: AE 1 3
Using keys we can be a bit faster:
setkey(df2,group)
df2[!J((df2[world == "AF",group]))]
base package:
df2[ df2$group != df2[ df2$world == 'AF', "group" ], ]
Output:
world place group
1 AB 1 1
2 AC 1 1
3 AD 2 1
7 AB 1 3
8 AE 2 3
9 AC 3 3
10 AE 1 3
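One caveat: the != comparison recycles, so it is only reliable when a single row has world equal to 'AF'. A %in% variant (a small change, shown as a sketch) handles any number of matching rows:

df2[!(df2$group %in% df2$group[df2$world == "AF"]), ]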
Using sqldf:
library(sqldf)
sqldf("SELECT df2.world, df2.place, [group] FROM df2
LEFT JOIN
(SELECT * FROM df2 WHERE world LIKE 'AF') AS t
USING([group])
WHERE t.world IS NULL")
Output:
world place group
1 AB 1 1
2 AC 1 1
3 AD 2 1
4 AB 1 3
5 AE 2 3
6 AC 3 3
7 AE 1 3
Base R option using ave
df2[with(df2, ave(world != "AF", group, FUN = all)),]
# world place group
#1 AB 1 1
#2 AC 1 1
#3 AD 2 1
#7 AB 1 3
#8 AE 2 3
#9 AC 3 3
#10 AE 1 3
Or we can also use subset
subset(df2, ave(world != "AF", group, FUN = all))
The above can also be written as
df2[with(df2, !ave(world == "AF", group, FUN = any)),]
and
subset(df2, !ave(world == "AF", group, FUN = any))
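For intuition about what ave() computes here: it applies all(world != "AF") within each group and recycles the group's result back to every row, giving a per-row logical index:

with(df2, ave(world != "AF", group, FUN = all))
# [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE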
I have a data.frame with column headers.
How can I get a specific row from the data.frame as a list (with the column headers as keys for the list)?
Specifically, my data.frame is
A B C
1 5 4.25 4.5
2 3.5 4 2.5
3 3.25 4 4
4 4.25 4.5 2.25
5 1.5 4.5 3
And I want to get a row that's the equivalent of
> c(a=5, b=4.25, c=4.5)
a b c
5.0 4.25 4.5
x[r,]
where r is the row you're interested in. Try this, for example:
#Add your data
x <- structure(list(A = c(5, 3.5, 3.25, 4.25, 1.5 ),
B = c(4.25, 4, 4, 4.5, 4.5 ),
C = c(4.5, 2.5, 4, 2.25, 3 )
),
.Names = c("A", "B", "C"),
class = "data.frame",
row.names = c(NA, -5L)
)
#The vector your result should match
y<-c(A=5, B=4.25, C=4.5)
#Test that the items in the row match the vector you wanted
x[1,]==y
This page (from this useful site) has good information on indexing like this.
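If you want an actual list (with the column headers as keys) rather than a one-row data.frame, a small follow-up works:

as.list(x[1, ])   # list with names A, B, C
unlist(x[1, ])    # named numeric vector, equivalent to c(A = 5, B = 4.25, C = 4.5)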
Logical indexing is very R-ish. Try:
x[ x$A ==5 & x$B==4.25 & x$C==4.5 , ]
Or:
subset( x, A ==5 & B==4.25 & C==4.5 )
Try:
> d <- data.frame(a=1:3, b=4:6, c=7:9)
> d
a b c
1 1 4 7
2 2 5 8
3 3 6 9
> d[1, ]
a b c
1 1 4 7
> d[1, ]['a']
a
1 1
If you don't know the row number, but do know some values then you can use subset
x <- structure(list(A = c(5, 3.5, 3.25, 4.25, 1.5 ),
B = c(4.25, 4, 4, 4.5, 4.5 ),
C = c(4.5, 2.5, 4, 2.25, 3 )
),
.Names = c("A", "B", "C"),
class = "data.frame",
row.names = c(NA, -5L)
)
subset(x, A ==5 & B==4.25 & C==4.5)
10 years later: using the tidyverse we can achieve this simply, taking a leaf from Christopher Bottoms. For a better grasp, see slice().
library(tidyverse)
x <- structure(list(A = c(5, 3.5, 3.25, 4.25, 1.5 ),
B = c(4.25, 4, 4, 4.5, 4.5 ),
C = c(4.5, 2.5, 4, 2.25, 3 )
),
.Names = c("A", "B", "C"),
class = "data.frame",
row.names = c(NA, -5L)
)
x
#> A B C
#> 1 5.00 4.25 4.50
#> 2 3.50 4.00 2.50
#> 3 3.25 4.00 4.00
#> 4 4.25 4.50 2.25
#> 5 1.50 4.50 3.00
y<-c(A=5, B=4.25, C=4.5)
y
#> A B C
#> 5.00 4.25 4.50
#The slice() verb allows one to subset data row-wise.
x <- x %>% slice(1) #(n) for the nth row, or (i:n) for range i to n, (i:n()) for i to last row...
x
#> A B C
#> 1 5 4.25 4.5
#Test that the items in the row match the vector you wanted
x[1,]==y
#> A B C
#> 1 TRUE TRUE TRUE
Created on 2020-08-06 by the reprex package (v0.3.0)