data.table manipulation and merging - r

I have data
# Per-row observations: eight ids spread over three groups.
dat1 <- data.table(
  id    = seq_len(8),
  group = rep(c(1, 2, 3), times = c(2, 3, 3)),
  value = c(5, 6, 10, 11, 12, 20, 21, 22)
)
# One reference value per group.
dat2 <- data.table(
  group = c(1, 2, 3),
  value = c(3, 6, 13)
)
and I would like to subtract dat2$value from each of the dat1$value, based on group.
Is this possible using data.table or does it require additional packages?

With data.table, you could do:
library(data.table)
# Join dat2 onto dat1 by group; dat2's value column shows up as i.value.
# The subtraction is element-wise over the joined table, so no `by` grouping
# is needed. The trailing [] prints the result after the := assignment.
dat1[dat2, on = "group"][, new.value := value - i.value][]
Which returns:
id group value i.value new.value
1: 1 1 5 3 2
2: 2 1 6 3 3
3: 3 2 10 6 4
4: 4 2 11 6 5
5: 5 2 12 6 6
6: 6 3 20 13 7
7: 7 3 21 13 8
8: 8 3 22 13 9
Alternatively, you can do this in one step as akrun mentions:
# Update join: rows of dat1 are matched to dat2 on group and newvalue is added
# to dat1 by reference; the i. prefix picks dat2's value column.
dat1[dat2, newvalue := value - i.value, on = .(group)]
id group value newvalue
1: 1 1 5 2
2: 2 1 6 3
3: 3 2 10 4
4: 4 2 11 5
5: 5 2 12 6
6: 6 3 20 7
7: 7 3 21 8
8: 8 3 22 9

Related

R DataTable Solution Fast Reshape

# Wide input table from the question: one row per StudentID.
data1 <- data.frame(
  StudentID = c(1, 2, 3, 4, 5),
  a1cat     = c(9, 10, 2, 0, 10),
  a2cat     = c(0, 2, 8, 6, 7),
  a3cat     = c(4, 2, 1, 6, 5),
  a1dog     = c(8, 4, 4, 5, 8),
  a2dog     = c(1, 9, 10, 5, 7),
  a3dog     = c(9, 3, 2, 7, 7),
  q20fox    = c(2, 8, 6, 1, 9),
  q22fox    = c(8, 10, 9, 6, 6),
  q24fox    = c(5, 0, 2, 9, 7)
)
# Desired long output: three timeX rows per StudentID.
data2 <- data.frame(
  StudentID = rep(1:5, each = 3),
  timeX     = rep(c(1, 2, 3), times = 5),
  meow      = c(9, 0, 4, 10, 2, 2, 2, 8, 1, 0, 6, 6, 10, 7, 5),
  bark      = c(8, 1, 9, 4, 9, 3, 4, 10, 2, 5, 5, 7, 8, 7, 7),
  woof      = c(2, 8, 5, 8, 10, 0, 6, 9, 2, 1, 6, 9, 9, 6, 7)
)
I have 'data1' and wish to get 'data2' using data.table to reshape the data and give new names for each column.
# Variant of the wide input whose fox columns end in digits (fox20, fox22, fox24).
data1x <- data.frame(
  StudentID = c(1, 2, 3, 4, 5),
  a1cat     = c(9, 10, 2, 0, 10),
  a2cat     = c(0, 2, 8, 6, 7),
  a3cat     = c(4, 2, 1, 6, 5),
  a1dog     = c(8, 4, 4, 5, 8),
  a2dog     = c(1, 9, 10, 5, 7),
  a3dog     = c(9, 3, 2, 7, 7),
  fox20     = c(2, 8, 6, 1, 9),
  fox22     = c(8, 10, 9, 6, 6),
  fox24     = c(5, 0, 2, 9, 7)
)
We can use melt with measure patterns
library(data.table)
# Reshape wide -> long. Each regex in patterns() selects one family of measure
# columns (names ending in "cat", ending in "dog", and ending in "fox" plus
# optional digits); value.name names the output column for each family in the
# same order, and variable.name holds the position of the source column within
# its family. setDT() converts data1 to a data.table by reference.
melt(setDT(data1),
     measure.vars = patterns("cat$", "dog$", "fox\\d*$"),
     value.name = c("meow", "bark", "woof"),
     variable.name = "timeX")[order(StudentID)]
# StudentID timeX meow bark woof
# 1: 1 1 9 8 2
# 2: 1 2 0 1 8
# 3: 1 3 4 9 5
# 4: 2 1 10 4 8
# 5: 2 2 2 9 10
# 6: 2 3 2 3 0
# 7: 3 1 2 4 6
# 8: 3 2 8 10 9
# 9: 3 3 1 2 2
#10: 4 1 0 5 1
#11: 4 2 6 5 6
#12: 4 3 6 7 9
#13: 5 1 10 8 9
#14: 5 2 7 7 6
#15: 5 3 5 7 7

Spread data table by id

I have the following data.table:
> df
month student A B C D
1: 1 Amy 9 6 1 11
2: 1 Bob 8 5 5 2
3: 2 Amy 7 7 2 4
4: 2 Bob 6 6 6 6
5: 3 Amy 6 8 10 7
6: 3 Bob 9 7 11 3
I want to transform this data.table to this format: > df1
month cols Amy Bob
1: 1 A 9 8
2: 1 B 6 5
3: 1 C 1 5
4: 1 D 11 2
5: 2 A 7 6
6: 2 B 7 6
7: 2 C 2 6
8: 2 D 4 6
9: 3 A 6 9
10: 3 B 8 7
11: 3 C 10 11
12: 3 D 7 3
I tried multiple ways using dcast etc. but I couldn't transform the data. Help please!
You have to melt the dataframe and then dcast -
# Step 1: go long -- one row per (month, student, original column). The former
# column names land in `cols` and their values in `value`.
tmp <- melt(df, id.vars = c("month", "student"), variable.name = "cols")
# Step 2: go wide again with one column per student.
df1 <- dcast(tmp, month + cols ~ student, value.var = "value")
Both are from the data.table library
A tidyr approach.
> library(tidyr)
> df %>%
gather(cols, values, A:D) %>%
spread(student, values)
month cols Amy Bob
1 1 A 9 8
2 1 B 6 5
3 1 C 1 5
4 1 D 11 2
5 2 A 7 6
6 2 B 7 6
7 2 C 2 6
8 2 D 4 6
9 3 A 6 9
10 3 B 8 7
11 3 C 10 11
12 3 D 7 3

Shifting row values by lag value in another column

I have a rather large dataset and I am interested in "marching" values forward through time based on values from another column. For example, if I have a Value = 3 at Time = 0 and a DesiredShift = 2, I want the 3 to shift down two rows to be at Time = 2. Here is a reproducible example.
Build reproducible fake data
library(data.table)
set.seed(1)
rowsPerID <- 8
# Every (ID, Time) combination: 2 IDs x rowsPerID time steps.
dat <- CJ(ID = 1:2, Time = seq_len(rowsPerID))
# Random counts that will be moved forward in time.
dat[, Value := rpois(n = .N, lambda = 4)]
# Number of rows (0, 1 or 2) each Value should move forward.
dat[, Shift := sample(x = 0:2, size = .N, replace = TRUE)]
Fake Data
# ID Time Value Shift
# 1: 1 1 3 2
# 2: 1 2 3 2
# 3: 1 3 4 1
# 4: 1 4 7 2
# 5: 1 5 2 2
# 6: 1 6 7 0
# 7: 1 7 7 1
# 8: 1 8 5 0
# 9: 2 1 5 0
# 10: 2 2 1 1
# 11: 2 3 2 0
# 12: 2 4 2 1
# 13: 2 5 5 2
# 14: 2 6 3 1
# 15: 2 7 5 1
# 16: 2 8 4 1
I want each Value to shift forward according to the Shift column. So the
DesiredOutput column for row 3 will be equal to 3 since the value at Time=1 is
Value = 3 and Shift = 2.
Row 4 shows 3+4=7 since 3 shifts down 2 and 4 shifts down 1.
I would like to be able to do this by ID group and hopefully take advantage
of data.table since speed is of interest for this problem.
Desired Result
# ID Time Value Shift DesiredOutput
# 1: 1 1 3 2 NA
# 2: 1 2 3 2 NA
# 3: 1 3 4 1 3
# 4: 1 4 7 2 3+4 = 7
# 5: 1 5 2 2 NA
# 6: 1 6 7 0 7+7 = 14
# 7: 1 7 7 1 2
# 8: 1 8 5 0 7+5 = 12
# 9: 2 1 5 0 5
# 10: 2 2 1 1 NA
# 11: 2 3 2 0 1+2 = 3
# 12: 2 4 2 1 NA
# 13: 2 5 5 2 2
# 14: 2 6 3 1 NA
# 15: 2 7 5 1 3+5=8
# 16: 2 8 4 1 5
I was hoping to get this working using the data.table::shift function, but I am unsure how to make this work using multiple lag parameters.
Try this:
# Row on which each Value should land. TargetIndex is a global row number, so
# it is only meaningful together with ID: a shift that runs past the last row
# of an ID must be dropped, not spill into the next ID's rows.
dat[, TargetIndex := .I + Shift]
# Total Value arriving at each (ID, target row).
toMerge <- dat[, list(Out = sum(Value)), by = c("ID", "TargetIndex")]
# Re-point TargetIndex at each row's own position so it can act as the join key.
dat[, TargetIndex := .I]
# Alternative: dat <- merge(dat, toMerge, by = c("ID", "TargetIndex"), all.x = TRUE)
# Update join; matching on ID as well discards any cross-ID spill-over.
dat[toMerge, on = c("ID", "TargetIndex"), DesiredOutput := i.Out]
> dat
# ID Time Value Shift TargetIndex DesiredOutput
# 1: 1 1 3 2 1 NA
# 2: 1 2 3 2 2 NA
# 3: 1 3 4 1 3 3
# 4: 1 4 7 2 4 7
# 5: 1 5 2 2 5 NA
# 6: 1 6 7 0 6 14
# 7: 1 7 7 1 7 2
# 8: 1 8 5 0 8 12
# 9: 2 1 5 0 9 5
# 10: 2 2 1 1 10 NA
# 11: 2 3 2 0 11 3
# 12: 2 4 2 1 12 NA
# 13: 2 5 5 2 13 2
# 14: 2 6 3 1 14 NA
# 15: 2 7 5 1 15 8
# 16: 2 8 4 1 16 5

How to refer to a column of a data.table by its position, in a sum() statement

I've googled the issue as many ways as my brain is capable of and I still can't find the answer. I'm new to R so there are some things that confuse me a little bit.
Let's say I have a data table like this:
x y z 100 200 300
1: 1 1 a 1 1 1
2: 1 1 b 2 3 4
3: 1 2 c 3 5 7
4: 1 2 d 4 7 0
5: 2 1 e 5 9 3
6: 2 1 f 6 1 6
7: 2 2 g 7 3 9
8: 2 2 h 8 5 2
This can be created with this piece of code:
# Build the example table from a bare list tagged with class "data.frame", then
# let setDT() turn it into a data.table. The numeric entries in .Names are
# coerced to the strings "100", "200", "300" by c(), which yields
# non-syntactic column names.
DT = setDT(structure(list(c(1, 1, 1, 1, 2, 2, 2, 2),
c(1, 1, 2, 2, 1, 1, 2, 2),
c("a","b","c","d","e","f","g","h"),
c(1,2,3,4,5,6,7,8),
c(1,3,5,7,9,1,3,5),
c(1,4,7,0,3,6,9,2)),
.Names = c("x", "y", "z", 100, 200, 300), row.names = c(NA, -8L), class = "data.frame"))
However, in my actual code, the last three columns were auto-generated using another function (dcast), so the total number of columns of the data.table is not static. Also, you may notice that the names of those three last columns are numeric, which might be a problem at some point.
What I need is to create one additional column for each "extra" column (the ones right after column "z"). I need the code to work as in this example: first, it creates column "100s"; then, for each row, it calculates the sum of column "100", considering only the rows with the same combination of x,y as the row in question. And so on for "200s" and "300s". Like this:
x y z 100 200 300 100s 200s 300s
1: 1 1 a 1 1 1 3 4 5
2: 1 1 b 2 3 4 3 4 5
3: 1 2 c 3 5 7 7 12 7
4: 1 2 d 4 7 0 7 12 7
5: 2 1 e 5 9 3 11 10 9
6: 2 1 f 6 1 6 11 10 9
7: 2 2 g 7 3 9 15 8 11
8: 2 2 h 8 5 2 15 8 11
I've tried with several modifications of this idea of a code:
# Attempt from the question. NOTE: inside j, `i` is just the loop counter, so
# sum(i) returns that number itself for every group rather than the sum of the
# i-th column.
for (i in 3:(dim(DT)[2])) {
DT <- DT[,paste(colnames(DT)[i], "s", sep=""):=sum(i),
by=c("x","y")]
}
This gives me the following result:
x y z 100 200 300 100s 200s 300s
1: 1 1 a 1 1 1 4 5 6
2: 1 1 b 2 3 4 4 5 6
3: 1 2 c 3 5 7 4 5 6
4: 1 2 d 4 7 0 4 5 6
5: 2 1 e 5 9 3 4 5 6
6: 2 1 f 6 1 6 4 5 6
7: 2 2 g 7 3 9 4 5 6
8: 2 2 h 8 5 2 4 5 6
Of course, R is not recognizing the numeric value of i as the number of the column it should consider for the sum; instead, it's taking it as a raw number. I can't figure out how to address a specific column by its position, because when it comes to sum(), that "with=FALSE" thing fails to save the day.
Any help will be appreciated.
There is no need for using a for loop in this case to get the desired result. You can update DT by reference with:
# New column names are the names of columns 3 to 5 with an "s" suffix; .SD
# holds the non-grouping columns, each summed within its (x, y) group and
# assigned by reference.
DT[, paste0(colnames(DT)[3:5],'s') := lapply(.SD, sum), by = .(x,y)]
which will give you the desired result:
> DT
x y 100 200 300 100s 200s 300s
1: 1 1 1 1 1 3 4 5
2: 1 1 2 3 4 3 4 5
3: 1 2 3 5 7 7 12 7
4: 1 2 4 7 0 7 12 7
5: 2 1 5 9 3 11 10 9
6: 2 1 6 1 6 11 10 9
7: 2 2 7 3 9 15 8 11
8: 2 2 8 5 2 15 8 11
When you don't know exactly which columns to sum, you could use one of the following methods:
# method 1: select by position -- every column from the third one onwards
DT[, paste0(colnames(DT)[3:ncol(DT)],'s') := lapply(.SD, sum), by = .(x,y)]
# method 2: select by name -- every column except the grouping columns x and y
DT[, paste0(setdiff(colnames(DT), c('x','y')),'s') := lapply(.SD, sum), by = .(x,y)]
With the updated example, probably the best way to do it is:
# Value columns: everything except the grouping keys x, y and the label z.
cols <- setdiff(names(DT), c("x", "y", "z"))
# Within each (x, y) group, sum every value column and store it as "<name>s".
DT[,
   (paste0(cols, "s")) := lapply(.SD, sum),
   by = c("x", "y"),
   .SDcols = cols]
which gives:
> DT
x y z 100 200 300 100s 200s 300s
1: 1 1 a 1 1 1 3 4 5
2: 1 1 b 2 3 4 3 4 5
3: 1 2 c 3 5 7 7 12 7
4: 1 2 d 4 7 0 7 12 7
5: 2 1 e 5 9 3 11 10 9
6: 2 1 f 6 1 6 11 10 9
7: 2 2 g 7 3 9 15 8 11
8: 2 2 h 8 5 2 15 8 11

Number of copies (duplicates) in R data.table

I want to add a column to a data.table which shows how many copies of each row exist. Take the following example:
library(data.table)
# Example with duplicated (colA, colB) pairs in rows 1-2 and rows 9-10.
DT <- data.table(id = 1:10, colA = c(1,1,2,3,4,5,6,7,7,7), colB = c(1,1,2,3,4,5,6,7,8,8))
setkey(DT, colA, colB)
# NOTE: colA is one of the `by` columns, and within each group a `by` column is
# a single value, so length(colA) is 1 for every group.
DT[, copies := length(colA), by = .(colA, colB)]
The output it gives is
id colA colB copies
1: 1 1 1 1
2: 2 1 1 1
3: 3 2 2 1
4: 4 3 3 1
5: 5 4 4 1
6: 6 5 5 1
7: 7 6 6 1
8: 8 7 7 1
9: 9 7 8 1
10: 10 7 8 1
Desired output is:
id colA colB copies
1: 1 1 1 2
2: 2 1 1 2
3: 3 2 2 1
4: 4 3 3 1
5: 5 4 4 1
6: 6 5 5 1
7: 7 6 6 1
8: 8 7 7 1
9: 9 7 8 2
10: 10 7 8 2
How should I do it?
I also want to know why my approach doesn't work. Isn't it true that when you group by colA and colB, the first group should contain two rows of data? I understand if "length" is not the function to use, but I cannot think of any other function to use. I thought of "nrow" but what can I pass to it?
# .N is the number of rows in the current group, so every row receives the
# size of its (colA, colB) group.
DT[, copies := .N, by=.(colA,colB)]
# id colA colB copies
# 1: 1 1 1 2
# 2: 2 1 1 2
# 3: 3 2 2 1
# 4: 4 3 3 1
# 5: 5 4 4 1
# 6: 6 5 5 1
# 7: 7 6 6 1
# 8: 8 7 7 1
# 9: 9 7 8 2
# 10: 10 7 8 2
As mentioned in the comments, .N will calculate the length of the grouped object as defined in the by argument.

Resources