How to update values from another table with conditions - r

I want to update values from table df1 with values from df2, only updating null values or zeros.
I can do it with data.table or dplyr, but I can't automate it for all columns.
#data.table
df1 <- data.frame(x1=1:4, x2=c('a','b', NA, 'd'), x3=c(0,0,2,2), stringsAsFactors=FALSE)
df2 <- data.frame(x1=2:3, x2=c("zz", "qq"),x3=6:7, stringsAsFactors=FALSE)
require(data.table)
setDT(df1); setDT(df2)
df1[df2, on = .(x1), x2 := ifelse(is.na(x2) | x2 == 0 ,i.x2,x2)]
#dplyr
# Inner-join df1 and df2 on x1, then build each output column by keeping
# df1's value (suffix .x) unless it is NA or 0, in which case df2's value
# (suffix .y) is used instead. Only rows present in both tables are returned.
library(dplyr)
inner_join(df1, df2, by = c("x1" = "x1")) %>%
  transmute(
    x1 = x1,
    x2 = ifelse(is.na(x2.x) | x2.x == 0, x2.y, x2.x),
    x3 = ifelse(is.na(x3.x) | x3.x == 0, x3.y, x3.x)
  )
With dplyr I can at least get the expected output by adding the columns manually; the problem is that the real data frame has far too many columns for that. Therefore I want to iterate across the columns to achieve the task.
What I´ve tried:
# dplyr + apply
inner_join(df1,df2,by = c("x1" = "x1")) %>%
cbind(.$x1,
apply(.[-1],2, function(cname) ifelse(is.na(cname) | cname == 'b',paste(cname, ".x", collapse = ""),paste(cname, ".y", collapse = "")))
)
# data.table with for
for (cname in names(df1)[!names(df1) %in% c("x1")]) {
df1[i = df2, on = .(x1), j = cname := {function (x) ifelse(is.na(x) | x == 'b',i.x,x)} (cname)
, with = FALSE]
}
# data.table + lapply
df1[i = df2, on = .(x1) ,names(df1)[!names(df1) %in% c("x1")] := lapply(df1[,names(df1)[!names(df1) %in% c("x1")],with=FALSE],
function(x) ifelse(is.na(x) | x == 0,df2.x,df1.x))]

Using base R, you can create a function to replace NA and 0 with corresponding values from another column
# Combine a pair of columns element-wise.
# `x` is a list (or data.frame) whose first element holds the preferred values
# and whose second element holds the fallback values. Wherever the preferred
# value is NA or equal to 0, the fallback value at the same position is used.
replace_na_0 <- function(x) {
  current <- x[[1]]
  needs_fill <- is.na(current) | current == 0
  ifelse(needs_fill, x[[2]], current)
}
Merge the two tables, then group the resulting columns by their name with the suffix (.x, .y) stripped, and pass each group to the replace_na_0 function
temp_df <- merge(df1, df2, by = "x1")
cbind(temp_df[1], sapply(split.default(temp_df[-1],
sub("\\..*", "", names(temp_df)[-1])), replace_na_0))
# x1 x2 x3
#1 2 b 6
#2 3 qq 2

For data.table, you can use:
# For every column except the join key x1, update df1 in place: rows whose
# value is NA or 0 receive the value looked up in df2 by matching on x1.
# The row filter must combine the two conditions with `|` (NA OR zero).
for (x in setdiff(names(df1), "x1")) {
  df1[is.na(get(x)) | get(x) == 0, (x) := df2[.SD, on = .(x1), get(x)]]
}

Here is a pure data.table approach...
The melting process takes care of all the columns you wish to 'fill', putting them all in one single set of columns (variable and value).
Then fill in all the 0/NA values using an update join (=fast!)
Finally, recast everything back to its original shape.
library(data.table)
#set to data.table
setDT(df1)
setDT(df2)
#melt to long
melt1 <- melt(df1, id.vars = "x1" )
melt2 <- melt(df2, id.vars = "x1" )
#join all values with value NA or 0
melt1[ is.na(value) | value == 0,
value := melt1[ is.na( value) | value == 0,][ melt2, value := i.value, on = .(x1, variable) ]$value][]
#cast to original wide format
dcast( melt1, x1 ~ variable )
output
# x1 x2 x3
# 1: 1 a 0
# 2: 2 b 6
# 3: 3 qq 2
# 4: 4 d 2

Related

How to group by pivot table with BREAKDOWN in same table data.table R

Data.table has various way to make a pivot table with by = function,
but how can we group the breakdown information in the SAME group by shape ?
Sample Data
# DT
DT <- data.table(GROUP = c("A_GROUP","B_GROUP","B_GROUP","B_GROUP","A_GROUP",
"A_GROUP","B_GROUP","B_GROUP","B_GROUP","A_GROUP"),
TYPE = c("A","B","C","D","E",
"B","B","A","A","E"),
AMOUNT =c(123,1424,1244,2111,44559,
128,1221,12144,11,439))
Separate Table but not grouped in one frame
# ALL
ALL_G <- DT[,.(SUM = format(sum(AMOUNT),big.mark=",")),by = TYPE]
# A_GROUP Breakdown 1
A_G <- DT[grepl("A_GROUP",GROUP),.(SUM =format(sum(AMOUNT),big.mark=",")),by = TYPE]
# B_GROUP Breakdown 2
B_G <- DT[grepl("B_GROUP",GROUP),.(SUM = format(sum(AMOUNT),big.mark=",")),by = TYPE]
Desire shape
# TARGET
TYPE ALL SUM A_GROUP_SUM B_GROUP_SUM
A 12,278 123 12,155
B 2,773 128 2,645
C 1,244 0 1,244
D 2,111 0 2,111
E 44,998 44998 0
How can i achieve this ?
library( data.table)
# sample data
DT <- data.table(GROUP = c("A_GROUP","B_GROUP","B_GROUP","B_GROUP","A_GROUP",
"A_GROUP","B_GROUP","B_GROUP","B_GROUP","A_GROUP"),
TYPE = c("A","B","C","D","E",
"B","B","A","A","E"),
AMOUNT =c(123,1424,1244,2111,44559,
128,1221,12144,11,439))
# create a data.table with the total AMOUNT per TYPE (column ALL_SUM)
dt1 <- DT[, list( ALL_SUM = sum( AMOUNT ) ), by = "TYPE" ]
# create a data.table with the total AMOUNT per TYPE and GROUP combination
dt2 <- DT[, list( sum = sum( AMOUNT ) ), by = c( "TYPE", "GROUP" )]
# rename the groups to the desired column names (e.g. "A_GROUP" -> "A_GROUP_SUM")
dt2[, GROUP := paste0( GROUP, "_SUM" )]
# cast to wide format: one row per TYPE, one column per GROUP;
# TYPE/GROUP combinations that do not occur are filled with 0
dt2 <- dcast( dt2, TYPE ~ GROUP, value.var = "sum", fill = 0 )
# option 1: join together on TYPE (you can use setcolorder() afterwards to get the desired order of columns)
dt2[dt1, on = "TYPE"]
# option 2: bind together (drop the first column of dt2; only works if both
# data.tables have the same number of rows; NOTE: cbind pairs rows by position,
# so both tables must also list TYPE in the same order)
cbind( dt1, dt2[, -1] )
# TYPE ALL_SUM A_GROUP_SUM B_GROUP_SUM
# 1 A 12278 123 12155
# 2 B 2773 128 2645
# 3 C 1244 0 1244
# 4 D 2111 0 2111
# 5 E 44998 44998 0

Pattern Searching in R

I have two data frames as below. DF1 is slightly messy (as you can see below): it has multiple values from DF2 combined into one column.
DF1
SRNo. Value
1 1ABCD2EFGH3IJKL
2 1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV
3 3ABCD4EFGH5IJKL
4 3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL
5 7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH
DF2
SRNo. Value
1 1ABCD2EFGH3IJKL
2 3ABCD4EFGH5IJKL
3 6PQRS7TUVW8XYZA
4 5FGHI9XUZX1RATP
5 9AGTY6UGFW0AAUU
6 6TEYD7RARA8MHAT
7 9IUYT7HGFD3LKJH
I want to do a look up using values column in both the data set. Here is what I am trying to accomplish.
i) For rows 1 & 3 in DF1 it is a simple look up in DF2. I expect the code to return those looked up values.
ii) For row #2 in DF1, only the first part of the string matches a value in DF2. I expect the code to return only the first part.
iii) For row#4 in DF1, both the parts in the string matches with values in DF2. In this case I want the first part of the string that is matching to be retained
iv) For Row #5, the second part in the string matches with the value in DF2. I would expect the code to return the 2nd part of the string.
I have around 47,000 rows in the first dataset and over 300,000 in the second, and of course there are other columns in both datasets. I have tried this in multiple ways using str_split/str_match but could not accomplish what I want. Every suggestion is appreciated. The rest of my code is in R.
Thank You
First step is to tidyr::separate() your DF1 at "/". Then I used dplyr::case_when() to see if there was a match between the first of the listed items in DF2 with %in%; if there wasn't then check against the second. I used dplyr::mutate() to append the results to DF1 under dat.
library(dplyr)
library(tidyr)
DF1 <- data.frame("SRNo." = 1:5, Value = c("1ABCD2EFGH3IJKL","1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV","3ABCD4EFGH5IJKL","3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL","7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH"), stringsAsFactors = F) %>% tbl_df()
DF2 <- data.frame("SRNo." = 1:7, Value = c("1ABCD2EFGH3IJKL","3ABCD4EFGH5IJKL","6PQRS7TUVW8XYZA","5FGHI9XUZX1RATP","9AGTY6UGFW0AAUU","6TEYD7RARA8MHAT","9IUYT7HGFD3LKJH"), stringsAsFactors = F) %>%tbl_df()
DF1 %>%
separate(Value, c("Value1", "Value2"), sep = "/") %>%
mutate(dat = case_when(
Value1 %in% DF2$Value ~ Value1,
Value2 %in% DF2$Value ~ Value2,
TRUE ~ NA_character_
))
# # A tibble: 5 x 4
# SRNo. Value1 Value2 dat
# <int> <chr> <chr> <chr>
# 1 1 1ABCD2EFGH3IJKL NA 1ABCD2EFGH3IJKL
# 2 2 1ABCD2EFGH3IJKL 7MLPO0OKMN8MNBV 1ABCD2EFGH3IJKL
# 3 3 3ABCD4EFGH5IJKL NA 3ABCD4EFGH5IJKL
# 4 4 3ABCD4EFGH5IJKL 1ABCD2EFGH3IJKL 3ABCD4EFGH5IJKL
# 5 5 7MLPO0OKMN8MNBV 9IUYT7HGFD3LKJH 9IUYT7HGFD3LKJH
Data.table solution
df1 <- read.table(text="SRNo. Value
1 1ABCD2EFGH3IJKL
2 1ABCD2EFGH3IJKL/7MLPO0OKMN8MNBV
3 3ABCD4EFGH5IJKL
4 3ABCD4EFGH5IJKL/1ABCD2EFGH3IJKL
5 7MLPO0OKMN8MNBV/9IUYT7HGFD3LKJH", header = T, stringsAsFactors = F)
df2 <- read.table( text = "SRNo. Value
1 1ABCD2EFGH3IJKL
2 3ABCD4EFGH5IJKL
3 6PQRS7TUVW8XYZA
4 5FGHI9XUZX1RATP
5 9AGTY6UGFW0AAUU
6 6TEYD7RARA8MHAT
7 9IUYT7HGFD3LKJH", header = T, stringsAsFactors = F )
library( data.table )
setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)]
setDT(df2)
resultv1 <- df2[ df1, on = c( Value = "Value1"), nomatch = 0L ]
resultv2 <- df2[ df1, on = c( Value = "Value2"), nomatch = 0L ]
result <- rbindlist( list( resultv1, resultv2 ) )[!duplicated( i.SRNo.)]
Benchmarking it against the solution from @Paul shows similar runtimes (~2.5 milliseconds), but data.table sometimes surprises me on larger data sets.
If memory becomes an issue, you can do it all in one go:
rbindlist( list( setDT(df2)[ setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)],
on = c( Value = "Value1"), nomatch = 0L ],
setDT(df2)[ setDT(df1)[, c( "Value1", "Value2" ) := tstrsplit( Value, "/", fixed = TRUE)],
on = c( Value = "Value2"), nomatch = 0L ] ) )[!duplicated( i.SRNo.)]

Removing rows where multiple columns equal an exact number R

I'd like to subset rows where x1 and x2 == 9. My real set has over 200 columns where the column name starts with the same string. The dummy code below creates a smaller sample of the data. I'd like to do this ideally with the R data.table package if possible.
df <- data.frame('id'=c(1,2,3), 'x1'=c(9,9,4), 'x2'=c(9,9,4))
head(df)
# does not work, but thought perhaps I could have defined the columns via a paste and then subset where columns were equal to 9.
df[which(paste0("x", 1:2)==9), ]
Update: sorry if I wasn't clear. I am aware of simply adding a filter for x1 and x2. The issue is that the real data consists of over 200 columns: x1:x200. I am in search of a cleaner solution than what is proposed below.
If you want an efficient base R solution I would simply use rowSums, e.g.
cols <- paste0("x", 1:2)
df[rowSums(df[cols] == 9) == length(cols), ]
# id x1 x2
# 1 1 9 9
# 2 2 9 9
If you want a data.table solution, I would use a binary join, e.g.
library(data.table)
setDT(df)[as.list(rep(9, length(cols))), on = cols]
# id x1 x2
# 1: 1 9 9
# 2: 2 9 9
Data
df <- data.frame(id = 1:3, x1 = c(9, 9, 4), x2 = c(9, 9, 4))
Something like this, perhaps?
df[apply(df[, paste0("x", 1:200)] == 9, 1, all), ]
A melt can allow you to not have to write out every column (for your >2 column case):
> aTbl = as.data.table(df)
> aTbl[, all9sP := F]
> aTbl[, .SD
][, !'all9sP'
][, melt(.SD, id.vars=c('id'))
][, NVars := uniqueN(variable)
][value == 9
][, .(N9s=.N), .(id, NVars)
][, all9sP := N9s == NVars
][, aTbl[.SD, all9sP := i.all9sP, on=.(id)]
][all9sP == T
][, all9sP := NULL
][, .SD
]
id x1 x2
1: 1 9 9
2: 2 9 9
>
Try:
df[df$x1 == 9 & df$x2 == 9,]
EDIT (misunderstood, now it should do the trick):
for (i in 2:200) {df = df[df[,i] == 9,]}
You could also use grep with apply
# Select all columns whose name starts with "x" (anchored, so a name that
# merely contains an "x" somewhere is not picked up)
col.names <- grep("^x", colnames(df), value = TRUE)
# Keep a row only when EVERY selected column equals 9.
# `%in%` never yields NA, so rows containing NA are simply dropped;
# drop = FALSE keeps a data.frame even if only one column matched.
sel <- apply(df[, col.names, drop = FALSE], 1, function(row) all(row %in% 9))
df[sel, ]
And the output
id x1 x2
1 1 9 9
2 2 9 9
Solution using data.table
Create dataset
ncols <- 5
cnms <- paste0("x", 1:ncols)
X <- data.table(ID = 1:1e6)
X[, (cnms) := NA_integer_]
X[, (cnms) := lapply(X = 1:ncols, sample, size = .N, x = 1:10)]
Find rows where sum equals 9
X1 <- X[, s := rowSums(.SD), .SDcols = cnms][s == 9, ][, s:= NULL][]
X1
Find rows where all columns are equal to 9
X[, s := NULL]
ind <- rowSums(X[, lapply(.SD, is.element, set = 9), .SDcols = cnms])
X2 <- X[ind == length(cnms)][]
X2
Edit
This is actually a lot faster:
X[, s := NULL]
ind <- rowSums(X[, .SD , .SDcols = cnms] == 9)
X2 <- X[ind == length(cnms)][]
X2
Edit2
See answer from https://stackoverflow.com/users/3001626/david-arenburg. A lot faster.
In the tidyverse, try rowwise and use filter as usual
df %>%
rowwise() %>%
filter(x1 %in% 9 & x2 %in% 9 )
Source: local data frame [2 x 3]
Groups: <by row>
# A tibble: 2 x 3
id x1 x2
<dbl> <dbl> <dbl>
1 1 9 9
2 2 9 9

Transpose whole dataframe into one row dataframe- (or transposing each row of data.table and column binding)

I have tried to transform my_dataset with the help of library reshape & data.table in order to achieve the result.dataset but haven't been successful as yet.
I have a data table my_dataset that looks like this :-
A X Count
id1 b 1
id1 c 2
And I want to have the result.dataset that should look like this :-
A X1 Count1 X2 Count2
id1 b 1 c 2
It would be great if anyone could help me to get the result.dataset as above, preferably by using reshape or data.table (or both lib).
Here's a solution that is using only reshape2 (trying to stick to the suggested packages). It starts by adding a column rep, that allows one to call dcast.
require(reshape2)
#adding rep
my_dataset$rep = unlist(tapply(my_dataset$A, my_dataset$A, function(x)1:length(x)))
#cast at work
C1 = dcast(my_dataset, A ~ paste('X',rep, sep=''), value.var='X')
C2 = dcast(my_dataset, A ~ paste('Count',rep, sep=''), value.var='Count')
result.dataset = cbind(C1, C2[,-1])
The columns will not be in the same order as your example though.
Try this:
dt <- read.table(text = 'A X Count
id1 b 1
id1 c 2',header=T)
a <- aggregate(.~A, dt, paste, collapse=",")
library(splitstackshape)
result <- concat.split.multiple(data = a, split.cols = c("X","Count"), seps = ",")
output:
> result
A X_1 X_2 Count_1 Count_2
1: id1 b c 1 2
We can aggregate the rows and use cSplit to split them.
library(data.table)
library(splitstackshape)
# Collapse every non-key column into one comma-separated string per value of A
dat2 <- setDT(dat)[, lapply(.SD, paste, collapse = ","), by = A]
# Build the desired column order: the key column first, then the value columns
# interleaved per occurrence (X_1, Count_1, X_2, Count_2, ...).
# The occurrence index is repeated once per value column so it lines up with
# the recycled column names.
# NOTE(review): 1:nrow(dat) assumes all rows belong to a single A group, as in
# the example data - confirm before using with several groups.
value_cols <- names(dat[, -1])
cols <- c(
  names(dat[, 1]),
  paste(value_cols, rep(1:nrow(dat), each = length(value_cols)), sep = "_")
)
# Split the collapsed strings back into separate columns and reorder them
cSplit(dat2, splitCols = names(dat[, -1]))[, cols, with = FALSE]
# A X_1 Count_1 X_2 Count_2
# 1: id1 b 1 c 2
DATA
dat <- read.table(text = "A X Count
id1 b 1
id1 c 2",
header = TRUE, stringsAsFactors = FALSE)

R / data.table() merge on named subset of another data.table

I'm trying to put together several files and need to do a bunch of merges on column names that are created inside a loop. I can do this fine using data.frame() but am having issues using similar code with a data.table():
library(data.table)
df1 <- data.frame(id = 1:20, col1 = runif(20))
df2 <- data.frame(id = 1:20, col1 = runif(20))
newColNum <- 5
newColName <- paste('col',newColNum ,sep='')
df1[,newColName] <- runif(20)
df2 <- merge(df2, df1[,c('id',newColName)], by = 'id', all.x = T) # Works fine
######################
dt1 <- data.table(id = 1:20, col1 = runif(20))
dt2 <- data.table(id = 1:20, col1 = runif(20))
newColNum <- 5
newColName <- paste('col',newColNum ,sep='')
dt1[,newColName] <- runif(20)
dt2 <- merge(dt2, dt1[,c('id',newColName)], by = 'id', all.x = T) # Doesn't work
Any suggestions?
This really has nothing to do with merge(), and everything to do with how the j (i.e. column) index is, by default, interpreted by [.data.table().
You can make the whole statement work by setting with=FALSE, which causes the j index to be interpreted as it would be in a data.frame:
dt2 <- merge(dt2, dt1[,c('id',newColName), with=FALSE], by = 'id', all.x = T)
head(dt2, 3)
# id col1 col5
# 1: 1 0.4954940 0.07779748
# 2: 2 0.1498613 0.12707070
# 3: 3 0.8969374 0.66894157
More precisely, from ?data.table:
with: By default 'with=TRUE' and 'j' is evaluated within the frame
of 'x'. The column names can be used as variables. When
'with=FALSE', 'j' is a vector of names or positions to
select.
Note that this could be avoided by storing the columns in a variable like so:
cols = c('id', newColName)
dt1[ , ..cols]
.. signals to "look up one level"
Try dt1[,list(id,get(newColName))] in your merge.

Resources